#18 drop feature for now

nicolarighetti · Dec 22, 2023 · 7153a36 · 7153a36
1 parent 02bcffb
commit 7153a36
Show file tree

Hide file tree

Showing 4 changed files with 2 additions and 243 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -30,8 +30,7 @@ Imports:
   Matrix,
   lubridate,
   igraph,
-  stringi,
-  textreuse
+  stringi
 Suggests: 
     knitr,
     rmarkdown,

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,7 +1,6 @@
 # Generated by roxygen2: do not edit by hand
 
 export(detect_groups)
-export(detect_similar_text)
 export(flag_speed_share)
 export(generate_coordinated_network)
 export(group_stats)
@@ -19,7 +18,6 @@ import(Matrix)
 import(RcppSimdJson)
 import(data.table)
 import(igraph)
-import(textreuse)
 importFrom(data.table,':=')
 importFrom(lubridate,as_datetime)
 importFrom(stats,quantile)

diff --git a/R/detect_groups.R b/R/detect_groups.R
@@ -242,174 +242,4 @@ filter_min_participation <- function(x, result, min_participation) {
   result <- result[(content_id %in% filt$content_id) | (content_id_y %in% filt$content_id)]
 
   return(result)
-}
-
-
-#' detect_similar_text
-#'
-#' This function detects coordinated cotweets, i.e. pairs of social media posts that
-#' are similar in terms of their text and were posted within a short time window.
-#'
-#' @details Uses the [textreuse](https://cran.r-project.org/package=textreuse) package
-#' to compare each post with each other and determine their text similarity. Use the
-#' [reshape_tweets()] function with `intent = "cotweet"` parameter to prepare your data.
-#'
-#' @param x A data.table with the following columns:
-#'   - content_id: The ID of the content (e.g. a tweet ID)
-#'   - object_id: The text of the social media post
-#'   - account_id: The ID of the user who shared the content
-#'   - timestamp_share: The timestamp when the content was shared
-#' @param min_participation The minimum number of actions within a specified timeframe
-#' required for a user to be included in subsequent analysis (default set at two).
-#' This criterion in network analysis corresponds with the concept of degree.
-#' It is important to differentiate this from the frequency of repeated interactions
-#' a user has with a particular other user, which is represented by edge weight.
-#' The edge weight parameter is utilized in the `generate_coordinated_network` function as a
-#' concluding step in identifying coordinated behavior.
-#' @param time_window The maximum time difference between two posts in order
-#'   for them to be considered coordinated cotweets (defaults to 10 seconds).
-#' @param min_similarity The minimum similarity score between two posts in order
-#'   for them to be considered coordinated cotweets (defaults to 0.8).
-#' @param similarity_function The function that is used to calculate the similarity
-#'   between two tweets. The default function is Jaccard Similarity (see: \link[textreuse]{jaccard_similarity}).
-#' @param tokenizer The function that is used to tokenize the text of the tweets.
-#'   The default function is the \link[textreuse]{tokenize_ngrams} function.
-#' @param minhash_seed The seed that is used to generate the minhash signatures.
-#'   If NULL, a random seed will be used.
-#' @param minhash_n The number of minhash signatures that are used (see `textreuse` package for details).
-#' @param skip_short Option passed to `textreuse`: Should short documents be skipped? Default `FALSE`
-#' @return A data.table with the following columns:
-#'   - content_id: The ID of the first post
-#'   - content_id_y: The ID of the second post
-#'   - account_id: The ID of the user who shared the first post
-#'   - account_id_y: The ID of the user who shared the second post
-#'   - timestamp_share: The timestamp when the first post was shared
-#'   - timestamp_share_y: The timestamp when the second post was shared
-#'   - similarity_score: The similarity score between the two posts
-#'   - time_delta: The time difference between the two posts
-#'
-#' @import textreuse
-#'
-#' @export
-
-detect_similar_text <- function(x,
-                                min_participation = 2,
-                                time_window = 10,
-                                min_similarity = 0.8,
-                                similarity_function = textreuse::jaccard_similarity,
-                                tokenizer = textreuse::tokenize_ngrams,
-                                minhash_seed = NULL,
-                                minhash_n = 200,
-                                skip_short=FALSE) {
-  a <- b <- score <- content_id <- account_id <-
-    timestamp_share <- similarity_score <-
-    time_delta <- timestamp_share_y <-
-    content_id_y <- account_id_y <- NULL
-
-  # Check arguments
-
-  stopifnot(is.data.table(x))
-  stopifnot(min_participation >= 1)
-  stopifnot(time_window >= 0)
-  stopifnot(min_similarity >= 0 && min_similarity <= 1)
-  stopifnot(is.function(similarity_function))
-  stopifnot(is.function(tokenizer))
-  if (!is.null(minhash_seed)) {
-    stopifnot(is.numeric(minhash_seed))
-    stopifnot(length(minhash_seed) == 1)
-  }
-  if (!is.null(minhash_n)) {
-    stopifnot(is.numeric(minhash_n))
-    stopifnot(length(minhash_n) == 1)
-    stopifnot(minhash_n >= 1)
-  }
-
-  if ("id_user" %in% colnames(x)) {
-    data.table::setnames(x, "id_user", "account_id")
-    warning("Your data contained the column `id_user`, this name is deprecated, renamed it to `account_id`")
-  }
-
-  # https://cran.r-project.org/web/packages/textreuse/vignettes/textreuse-minhash.html
-  minhash <- textreuse::minhash_generator(n = minhash_n, seed = minhash_seed)
-
-  texts <- x$object_id
-  names(texts) <- x$content_id
-
-  # Create a corpus object with the TextReuse package
-  # Caveats:
-  # - minimum length of n-grams cannot be specified
-  #   due to a bug in textreuse (see: https://github.com/ropensci/textreuse/pull/80)
-  corpus <- textreuse::TextReuseCorpus(
-    text = texts,
-    tokenizer = tokenizer,
-    minhash_func = minhash,
-    keep_tokens = TRUE,
-    progress = TRUE,
-    skip_short = skip_short
-  )
-
-  buckets <- textreuse::lsh(corpus, bands = 80, progress = TRUE)
-  candidates <- textreuse::lsh_candidates(buckets)
-
-  result <- data.table(
-    textreuse::lsh_compare(
-      candidates,
-      corpus,
-      similarity_function,
-      progress = TRUE
-    )
-  )
-
-  result_a <- result[, .(a, score)]
-  result_b <- result[, .(b, score)]
-
-  setnames(result_a, "a", "content_id")
-  setnames(result_b, "b", "content_id")
-
-  cotweet_pairs_x <- x[result_a,
-    .(content_id, account_id, timestamp_share, score),
-    on = "content_id"
-  ]
-  cotweet_pairs_y <- x[result_b,
-    .(content_id, account_id, timestamp_share),
-    on = "content_id"
-  ]
-
-  setnames(
-    cotweet_pairs_y,
-    c("content_id", "account_id", "timestamp_share"),
-    c("content_id_y", "account_id_y", "timestamp_share_y")
-  )
-
-  cotweet_pairs <- cbind(cotweet_pairs_x, cotweet_pairs_y)
-
-  setnames(cotweet_pairs, "score", "similarity_score")
-
-  # filter by document similarity and time_window
-  coordinated_cotweets <- cotweet_pairs[
-    similarity_score >= min_similarity,
-    time_delta := timestamp_share - timestamp_share_y
-  ][abs(time_delta) <= time_window]
-
-  # filter by minimum participation
-  coordinated_cotweets <-
-    filter_min_participation(x, coordinated_cotweets, min_participation)
-
-  # filter out loops
-
-  coordinated_cotweets <- do_remove_loops(coordinated_cotweets)
-
-  # ---------------------------
-  # Sort output: content_id should be older than content_id_y
-  # Therefore, we swap all values with positive time_delta
-  # and return the absolute value
-
-  coordinated_cotweets[
-    time_delta > 0,
-    c("content_id", "content_id_y", "account_id", "account_id_y") :=
-      .(content_id_y, content_id, account_id_y, account_id)
-  ]
-  coordinated_cotweets[, time_delta := abs(time_delta)]
-
-  return(coordinated_cotweets)
-}
+}
diff --git a/man/detect_similar_text.Rd b/man/detect_similar_text.Rd