In recent years, several researchers have shown that Twitter data can be used to predict real-world events such as earthquakes [1], the development of stock-market indicators [2], the outcome of political elections [3], the spread of diseases [4], or movie box-office sales [5]. These studies provide promising results suggesting that Twitter data can be used successfully for predictions; recently, however, several researchers have questioned both the predictive power of Twitter and the research methods applied [6, 7].
It seems there are several challenges which make it hard to verify whether and how well proposed methods actually work:
Given this multitude of decisions and the predefined knowledge required to conduct the experiments, combined with the difficulty other researchers face in repeating them, Twitter prediction research seems to be at risk of being influenced by the observer-expectancy effect, which means that the researcher subconsciously affects the research result.
My colleague Amal Almansour from King's College London and I were particularly interested in the decisions made during Twitter prediction research. We have just finished a literature survey in which we critically analyzed 24 existing Twitter prediction studies. In this study, we identified the different actors involved in the typical Twitter research process and their potential impact on the prediction method and, in turn, on the prediction result.
This study is currently under peer review; the results will be published here soon.
[1]
T. Sakaki, M. Okazaki, and Y. Matsuo, “Earthquake shakes Twitter users: real-time event detection by social sensors,”
Proceedings of the 19th international conference on world wide web, pp. 851-860, 2010.
[Bibtex]
@inproceedings{Sakaki2010,
  author        = {Sakaki, Takeshi and Okazaki, Makoto and Matsuo, Yutaka},
  title         = {Earthquake Shakes {Twitter} Users: Real-Time Event Detection by Social Sensors},
  booktitle     = {Proceedings of the 19th International Conference on World Wide Web},
  publisher     = {ACM},
  pages         = {851--860},
  year          = {2010},
  doi           = {10.1145/1772690.1772777},
  isbn          = {9781605587998},
  url           = {http://portal.acm.org/citation.cfm?doid=1772690.1772777},
  keywords      = {event detection,location estimation,social sensor,twitter},
  abstract      = {Twitter, a popular microblogging service, has received much attention recently. An important characteristic of Twitter is its real-time nature. For example, when an earthquake occurs, people make many Twitter posts (tweets) related to the earthquake, which enables detection of earthquake occurrence promptly, simply by observing the tweets. As described in this paper, we investigate the real-time interaction of events such as earthquakes in Twitter and propose an algorithm to monitor tweets and to detect a target event. To detect a target event, we devise a classifier of tweets based on features such as the keywords in a tweet, the number of words, and their context. Subsequently, we produce a probabilistic spatiotemporal model for the target event that can find the center and the trajectory of the event location. We consider each Twitter user as a sensor and apply Kalman filtering and particle filtering, which are widely used for location estimation in ubiquitous/pervasive computing. The particle filter works better than other comparable methods for estimating the centers of earthquakes and the trajectories of typhoons. As an application, we construct an earthquake reporting system in Japan. Because of the numerous earthquakes and the large number of Twitter users throughout the country, we can detect an earthquake with high probability (96\% of earthquakes of Japan Meteorological Agency (JMA) seismic intensity scale 3 or more are detected) merely by monitoring tweets. Our system detects earthquakes promptly and sends e-mails to registered users. Notification is delivered much faster than the announcements that are broadcast by the JMA.},
  internal-note = {Review: entry type changed from article to inproceedings (venue is a proceedings volume). Removed issn 1605587990 (ten digits, not a valid ISSN; it is the ISBN-10 form of the isbn field). Removed pmid 14716836 and arXiv id 0808.0743v3, which look like reference-manager import artifacts -- confirm before restoring.}
}
[2]
X. Zhang, H. Fuehres, and P. A. Gloor,
Predicting Stock Market Indicators Through Twitter “I hope it is not as bad as I fear”, 2011.
[Bibtex]
@article{Zhang2011,
  author        = {Zhang, Xue and Fuehres, Hauke and Gloor, Peter A.},
  title         = {Predicting Stock Market Indicators Through {Twitter} ``{I} hope it is not as bad as {I} fear''},
  journal       = {Procedia - Social and Behavioral Sciences},
  volume        = {26},
  pages         = {55--62},
  year          = {2011},
  doi           = {10.1016/j.sbspro.2011.10.562},
  issn          = {1877-0428},
  internal-note = {Review: entry type changed from misc to article and booktitle moved to journal. Dropped the abstract field (it only repeated the citation string) and the isbn field (it duplicated the ISSN).}
}
[3]
A. Tumasjan, T. Sprenger, P. Sandner, and I. Welpe, “Predicting Elections with Twitter: What 140 Characters Reveal about Political Sentiment,”
ICWSM, pp. 178-185, 2010.
[Bibtex]
@inproceedings{Tumasjan2010,
  author        = {Tumasjan, Andranik and Sprenger, Timm O. and Sandner, Philipp G. and Welpe, Isabell M.},
  title         = {Predicting Elections with {Twitter}: What 140 Characters Reveal about Political Sentiment},
  booktitle     = {Proceedings of the Fourth International {AAAI} Conference on Weblogs and Social Media ({ICWSM})},
  publisher     = {AAAI Press},
  pages         = {178--185},
  year          = {2010},
  url           = {http://www.aaai.org/ocs/index.php/ICWSM/ICWSM10/paper/viewFile/1441/1852},
  keywords      = {Twitter,data mining,elections,microblogging,politics,sentiment analysis},
  abstract      = {Twitter is a microblogging website where users read and write millions of short messages on a variety of topics every day. This study uses the context of the German federal election to investigate whether Twitter is used as a forum for political deliberation and whether online messages on Twitter validly mirror offline political sentiment. Using LIWC text analysis software, we conducted a content analysis of over 100,000 messages containing a reference to either a political party or a politician. Our results show that Twitter is indeed used extensively for political deliberation. We find that the mere number of messages mentioning a party reflects the election result. Moreover, joint mentions of two parties are in line with real world political ties and coalitions. An analysis of the tweets' political sentiment demonstrates close correspondence to the parties' and politicians' political positions indicating that the content of Twitter messages plausibly reflects the offline political landscape. We discuss the use of microblogging message content as a valid indicator of political sentiment and derive suggestions for further research.},
  internal-note = {Review: entry type changed from article to inproceedings. Removed doi 10.1074/jbc.M501708200 and issn 00219258 (both belong to the Journal of Biological Chemistry), plus pmid 16046402 and isbn 0894439310386, which do not belong to this paper. Author given names expanded from the mangled initials To, Pg, Im; booktitle expanded from ICWSM -- confirm both against the proceedings.}
}
[4] A. Signorini, A. M. Segre, et al., “Using Twitter to Estimate H1N1 Influenza Activity,”
9th Annual Conference of the International Society for Disease Surveillance, 2010.
[Bibtex]
@inproceedings{Signorini2010,
  author        = {Signorini, A. and Segre, A. M. and others},
  title         = {Using {Twitter} to Estimate {H1N1} Influenza Activity},
  booktitle     = {9th Annual Conference of the International Society for Disease Surveillance},
  year          = {2010},
  url           = {http://www.cs.uiowa.edu/~asignori/papers/using-twitter-to-estimate-H1N1-activity.pdf},
  abstract      = {Objective This paper describes a system that uses Twitter to estimate influenza-like illness levels by geographic region. Background Twitter is a free social networking and micro-blogging service that enables its millions of users to send and read each other's ``tweets,'' or short \ldots},
  internal-note = {Review: the exported author list and venue were truncated with ellipses. The author list now ends in and-others, and the booktitle was expanded from the fragment 9th Annual Conference of the ... -- confirm the venue name and fill in the remaining authors. The url field previously had a second papers:// URI appended; only the http URL is kept.}
}
[5]
S. Asur and B. Huberman, “Predicting the future with social media,” in
2010 IEEE/WIC/ACM International Conference on Web Intelligence and Intelligent Agent Technology (WI-IAT), 2010, pp. 492-499.
[Bibtex]
@inproceedings{Asur2010,
  author        = {Asur, Sitaram and Huberman, Bernardo A.},
  title         = {Predicting the Future with Social Media},
  booktitle     = {2010 {IEEE/WIC/ACM} International Conference on Web Intelligence and Intelligent Agent Technology ({WI-IAT})},
  publisher     = {IEEE},
  pages         = {492--499},
  year          = {2010},
  doi           = {10.1109/WI-IAT.2010.63},
  isbn          = {978-1-4244-8482-9},
  eprint        = {1003.5699v1},
  archivePrefix = {arXiv},
  url           = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=5616710},
  abstract      = {In recent years, social media has become ubiquitous and important for social networking and content sharing. And yet, the content that is generated from these websites remains largely untapped. In this paper, we demonstrate how social media content can be used to predict real-world outcomes. In particular, we use the chatter from Twitter.com to forecast box-office revenues for movies. We show that a simple model built from the rate at which tweets are created about particular topics can outperform market-based predictors. We further demonstrate how sentiments extracted from Twitter can be further utilized to improve the forecasting power of social media.},
  internal-note = {Review: booktitle expanded from a truncated export (... Agent Technology (WI-IAT), 2010 IEEE ...) -- confirm exact wording. Removed issn 03062619, which appears to belong to a different serial. The arXiv: prefix was stripped from eprint and the duplicate arxivId field dropped.}
}
[6] A. Jungherr, P. Jürgens, and H. Schoen, “Why the Pirate Party won the German election of 2009 or the trouble with predictions: A response to Tumasjan, A., Sprenger, T. O., Sander, P. G., & Welpe, I. M. ‘Predicting elections with Twitter: What 140 characters reveal about political sentiment’,”
Social Science Computer Review, vol. 30, iss. 2, pp. 229-234, 2012.
[Bibtex]
[7]
P. T. Metaxas, E. Mustafaraj, and D. Gayo-Avello, “How (Not) to predict elections,” in
Proceedings – 2011 IEEE International Conference on Privacy, Security, Risk and Trust and IEEE International Conference on Social Computing, PASSAT/SocialCom 2011, 2011, pp. 165-171.
[Bibtex]
@inproceedings{Metaxas2011,
  author        = {Metaxas, Panagiotis T. and Mustafaraj, Eni and Gayo-Avello, Daniel},
  title         = {How (Not) to Predict Elections},
  booktitle     = {Proceedings - 2011 {IEEE} International Conference on Privacy, Security, Risk and Trust and {IEEE} International Conference on Social Computing, {PASSAT/SocialCom} 2011},
  pages         = {165--171},
  year          = {2011},
  doi           = {10.1109/PASSAT/SocialCom.2011.98},
  isbn          = {9780769545783},
  abstract      = {Using social media for political discourse is increasingly becoming common practice, especially around election time. Arguably, one of the most interesting aspects of this trend is the possibility of ``pulsing'' the public's opinion in near real-time and, thus, it has attracted the interest of many researchers as well as news organizations. Recently, it has been reported that predicting electoral outcomes from social media data is feasible, in fact it is quite simple to compute. Positive results have been reported in a few occasions, but without an analysis on what principle enables them. This, however, should be surprising given the significant differences in the demographics between likely voters and users of online social networks. This work aims to test the predictive power of social media metrics against several Senate races of the two recent US Congressional elections. We review the findings of other researchers and we try to duplicate their findings both in terms of data volume and sentiment analysis. Our research aim is to shed light on why predictions of electoral (or other social events) using social media might or might not be feasible. In this paper, we offer two conclusions and a proposal: First, we find that electoral predictions using the published research methods on Twitter data are not better than chance. Second, we reveal some major challenges that limit the predictability of election results through data from social media. We propose a set of standards that any theory aiming to predict elections (or other social events) using social media should follow.},
  internal-note = {Review: removed issn 1457719312 (ten digits, not a valid ISSN; possibly a second ISBN of the proceedings -- confirm).}
}
[8]
D. J. Hand, “Principles of data mining,”
Drug Safety, vol. 30, iss. 7, pp. 621-622, 2007.
[Bibtex]
@article{Hand2007,
  author        = {Hand, David J.},
  title         = {Principles of Data Mining},
  journal       = {Drug Safety},
  volume        = {30},
  number        = {7},
  pages         = {621--622},
  year          = {2007},
  doi           = {10.2165/00002018-200730070-00010},
  issn          = {0114-5916},
  pmid          = {17604416},
  abstract      = {Data mining is the discovery of interesting, unexpected or valuable structures in large datasets. As such, it has two rather different aspects. One of these concerns large-scale, 'global' structures, and the aim is to model the shapes, or features of the shapes, of distributions. The other concerns small-scale, 'local' structures, and the aim is to detect these anomalies and decide if they are real or chance occurrences. In the context of signal detection in the pharmaceutical sector, most interest lies in the second of the above two aspects; however, signal detection occurs relative to an assumed background model, therefore, some discussion of the first aspect is also necessary. This paper gives a lightning overview of data mining and its relation to statistics, with particular emphasis on tools for the detection of adverse drug reactions.},
  internal-note = {Review: entry type changed from inproceedings to article (Drug Safety is a journal; booktitle moved to journal). Issue number 7 is inferred from the DOI -- confirm. Removed isbn 026208290X, which appears to belong to a book of the same title rather than this article.}
}