Understanding and Applying Topic Modeling Techniques in R for Social Media Analysis: A Case Study on Brexit Tweets

Here are the code and data, reformatted so that they can be used to recreate the example:

# Raw Data
# Five sample tweets with their engagement counts, author, ID, length,
# hashtag/URL counts, sentiment score and stance label. Assembled inside
# local() so the helper names do not leak into the calling environment.
raw_data <- local({
  tweet_text <- c(
    "RT @carolemills77: Many thanks to all the @mkcouncil #EUref staff who are already in the polling stations ready to open at 7am and the Elec",
    "RT @BetterOffOut: If you agree with @DanHannanMEP, please RT. #VoteLeave #Brexit #BetterOffOut ",
    "RT @iaingartside: Out with @DavidNuttallMP @DeneVernon @CllrSueNuttall Campaigning to \"Leave\" in the EU ref in Bury Today #Brexit https://t",
    "RT @simplysimontfa: Don't be distracted by good opinion polls 4 Leave; the only way to get our country back is to maximise the Brexit vote",
    "@GrumpyPete Just a little light relief #BetterOffOut #VoteLeave"
  )

  tweet_ids <- c(
    "745870298600316929",
    "740663385214324737",
    "741306107059130368",
    "742477469983363076",
    "743146889596534785"
  )

  columns <- list(
    numRetweets = c(1L, 339L, 1L, 179L, 0L),
    numFavorites = c(2L, 178L, 2L, 152L, 0L),
    username = c("iainastewart", rep("DavidNuttallMP", 4L)),
    tweet_ID = tweet_ids,
    tweet_length = c(140L, 118L, 140L, 139L, 63L),
    tweet = tweet_text,
    number_hashtags = c(1L, 3L, 1L, 1L, 2L),
    number_URLs = integer(5L),
    sentiment_score = c(2L, 2L, -1L, 0L, 0L),
    stance = rep("leave", 5L)
  )

  # Same compact row-name encoding that dput() emits for a 5-row data frame.
  structure(columns, row.names = c(NA, 5L), class = "data.frame")
})

# DTM
# A plain data frame with one row per tweet: the tweet ID (`docid`), a row
# index, the tweet length, and a list column `doc_terms` holding the terms
# kept for each tweet. Note this is NOT a tm::DocumentTermMatrix object.
dtm <- structure(
  list(
    docid = c("745870298600316929", "740663385214324737", "741306107059130368", "742477469983363076", "743146889596534785"),
    row_id = c(1, 2, 3, 4, 5),
    doc_length = c(140L, 118L, 140L, 139L, 63L),
    doc_terms = list(
      c("EUref", "mkcouncil"),
      c("Leave", "VoteLeave"),
      c("Leave", "DavidNuttallMP", "DeneVernon"),
      c("Leave", "Brexit"),
      c("BetterOffOut")
    )
  ),
  # Row names must be integer or character; a double vector such as
  # c(1, 2, 3, 4, 5) is rejected. c(NA, -5L) is the compact integer form
  # for automatic row names 1..5.
  row.names = c(NA, -5L),
  class = "data.frame"
)

# EDIT: Print the dput() representation of the raw data and of the DTM to
# the console (file = "" is dput()'s default destination).
dput(x = raw_data, file = "")
dput(x = dtm, file = "")

Note that the printed dput() output is not reproduced here, as it is too large to share in plain text. Running the dput() calls above prints structure(...) expressions that can be pasted into a fresh R session to recreate the example objects.

Example usage:

# Load required libraries
library(tm)

# Build a corpus from the tweet text. DocumentTermMatrix() is meant to be
# given a corpus of documents, not the whole data frame (which also holds
# counts, IDs and labels that are not text to be tokenised).
tweet_corpus <- VCorpus(VectorSource(raw_data$tweet))

# Create a document-term matrix (DTM) object: one row per tweet, one column
# per term, in the same order as the rows of raw_data.
dtm_object <- DocumentTermMatrix(tweet_corpus)

# Get the tf-idf matrix for the DTM object. tm does not provide getTfIdf();
# weightTfIdf() is its tf-idf weighting function.
tfidf_matrix <- as.matrix(weightTfIdf(dtm_object))

# merge() needs two data frames that share the key column, so turn the tf-idf
# matrix into a data frame keyed by tweet_ID. Term columns are prefixed so
# they cannot collide with existing raw_data columns (e.g. a term "tweet").
tfidf_df <- as.data.frame(tfidf_matrix)
names(tfidf_df) <- paste0("tfidf_", colnames(tfidf_matrix))
tfidf_df$tweet_ID <- raw_data$tweet_ID

# Merge the raw data with the tf-idf scores using merge(); all.x = TRUE keeps
# every row of raw_data even if it has no match.
merge_result <- merge(raw_data, tfidf_df, by = "tweet_ID", all.x = TRUE)

Please note that you need to replace raw_data with your actual data; dtm_object is then derived from it by the code above rather than supplied separately.


Last modified on 2024-02-16