This dataset contains a collection of keyphraseness values for phrases
extracted from Wikipedia articles. The keyphraseness value Q(s) of a phrase s
is the probability that the phrase appears in a Wikipedia article as anchor
text. In total, 4,342,732 phrases were extracted from the English Wikipedia
dump created on January 30, 2010. In this release, we have removed the 184,979
phrases containing non-English characters. Among the remaining 4,157,753
phrases, about 1.9 million phrases have non-zero keyphraseness values.

Each line in the text file is a mapping:
[phrase],[keyphraseness value] (e.g., jackie_chan, 0.9509918319719953).

This dataset has been used in the following 3 papers. Please refer to the
papers for more details about the dataset and how the keyphraseness values can
be used in various tasks.

This dataset is released solely for research purposes. Please cite at least
one of the following 3 papers if you use this dataset in your research.

@inproceedings{LiSigir2013,
  author    = {Li, Chenliang and Sun, Aixin and Weng, Jianshu and He, Qi},
  title     = {Exploiting Hybrid Contexts for Tweet Segmentation},
  booktitle = {Proceedings of the 36th International {ACM} {SIGIR} Conference on Research and Development in Information Retrieval},
  series    = {SIGIR '13},
  year      = {2013},
  isbn      = {978-1-4503-2034-4},
  location  = {Dublin, Ireland},
  pages     = {523--532},
  numpages  = {10},
  url       = {https://doi.org/10.1145/2484028.2484044},
  doi       = {10.1145/2484028.2484044},
  acmid     = {2484044},
  publisher = {ACM},
  address   = {New York, NY, USA},
  keywords  = {named entity recognition, tweet, tweet segmentation, twitter},
}

@inproceedings{LiCikm2012,
  author    = {Li, Chenliang and Sun, Aixin and Datta, Anwitaman},
  title     = {Twevent: Segment-based Event Detection from Tweets},
  booktitle = {Proceedings of the 21st {ACM} International Conference on Information and Knowledge Management},
  series    = {CIKM '12},
  year      = {2012},
  isbn      = {978-1-4503-1156-4},
  location  = {Maui, Hawaii, USA},
  pages     = {155--164},
  numpages  = {10},
  url       = {https://doi.org/10.1145/2396761.2396785},
  doi       = {10.1145/2396761.2396785},
  acmid     = {2396785},
  publisher = {ACM},
  address   = {New York, NY, USA},
  keywords  = {event detection, microblogging, tweet segmentation, twitter},
}

@inproceedings{LiSigir2012,
  author    = {Li, Chenliang and Weng, Jianshu and He, Qi and Yao, Yuxia and Datta, Anwitaman and Sun, Aixin and Lee, Bu-Sung},
  title     = {{TwiNER}: Named Entity Recognition in Targeted {Twitter} Stream},
  booktitle = {Proceedings of the 35th International {ACM} {SIGIR} Conference on Research and Development in Information Retrieval},
  series    = {SIGIR '12},
  year      = {2012},
  isbn      = {978-1-4503-1472-5},
  location  = {Portland, Oregon, USA},
  pages     = {721--730},
  numpages  = {10},
  url       = {https://doi.org/10.1145/2348283.2348380},
  doi       = {10.1145/2348283.2348380},
  acmid     = {2348380},
  publisher = {ACM},
  address   = {New York, NY, USA},
  keywords  = {named entity recognition, tweets, twitter, web n-gram, wikipedia},
}