Skip to content

Commit

Permalink
#18 drop feature for now
Browse files Browse the repository at this point in the history
  • Loading branch information
mrwunderbar666 committed Dec 22, 2023
1 parent 02bcffb commit 7153a36
Show file tree
Hide file tree
Showing 4 changed files with 2 additions and 243 deletions.
3 changes: 1 addition & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@ Imports:
Matrix,
lubridate,
igraph,
stringi,
textreuse
stringi
Suggests:
knitr,
rmarkdown,
Expand Down
2 changes: 0 additions & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# Generated by roxygen2: do not edit by hand

export(detect_groups)
export(detect_similar_text)
export(flag_speed_share)
export(generate_coordinated_network)
export(group_stats)
Expand All @@ -19,7 +18,6 @@ import(Matrix)
import(RcppSimdJson)
import(data.table)
import(igraph)
import(textreuse)
importFrom(data.table,':=')
importFrom(lubridate,as_datetime)
importFrom(stats,quantile)
Expand Down
172 changes: 1 addition & 171 deletions R/detect_groups.R
Original file line number Diff line number Diff line change
Expand Up @@ -242,174 +242,4 @@ filter_min_participation <- function(x, result, min_participation) {
result <- result[(content_id %in% filt$content_id) | (content_id_y %in% filt$content_id)]

return(result)
}


#' detect_similar_text
#'
#' This function detects coordinated cotweets, i.e. pairs of social media posts
#' that are similar in terms of their text and were posted within a short time
#' window.
#'
#' @details Uses the [textreuse](https://cran.r-project.org/package=textreuse)
#' package to compare each post with each other and determine their text
#' similarity. Use the [reshape_tweets()] function with `intent = "cotweet"`
#' parameter to prepare your data.
#'
#' Candidate pairs are found with minhash / locality-sensitive hashing, scored
#' with `similarity_function`, and then filtered by `min_similarity`,
#' `time_window` and `min_participation`. Pairs whose two posts are the same
#' are removed afterwards.
#'
#' @param x A data.table with the following columns:
#' - content_id: The ID of the content (e.g. a tweet ID)
#' - object_id: The text of the social media post
#' - account_id: The ID of the user who shared the content
#' - timestamp_share: The timestamp when the content was shared
#'
#' A deprecated column `id_user` is renamed to `account_id` (by reference,
#' with a warning).
#' @param min_participation The minimum number of actions within a specified
#' timeframe required for a user to be included in subsequent analysis
#' (default set at two). This criterion in network analysis corresponds with
#' the concept of degree. It is important to differentiate this from the
#' frequency of repeated interactions a user has with a particular other user,
#' which is represented by edge weight. The edge weight parameter is utilized
#' in the `generate_coordinated_network` function as a concluding step in
#' identifying coordinated behavior.
#' @param time_window The maximum time difference between two posts in order
#' for them to be considered coordinated cotweets (defaults to 10 seconds).
#' @param min_similarity The minimum similarity score between two posts in
#' order for them to be considered coordinated cotweets (defaults to 0.8).
#' @param similarity_function The function that is used to calculate the
#' similarity between two tweets. The default function is Jaccard Similarity
#' (see: \link[textreuse]{jaccard_similarity}).
#' @param tokenizer The function that is used to tokenize the text of the
#' tweets. The default function is the \link[textreuse]{tokenize_ngrams}
#' function.
#' @param minhash_seed The seed that is used to generate the minhash
#' signatures. If NULL, a random seed will be used.
#' @param minhash_n The number of minhash signatures that are used (see
#' `textreuse` package for details).
#' @param skip_short Option passed to `textreuse`: Should short documents be
#' skipped? Default `FALSE`
#' @return A data.table with the following columns:
#' - content_id: The ID of the first (older) post
#' - content_id_y: The ID of the second post
#' - account_id: The ID of the user who shared the first post
#' - account_id_y: The ID of the user who shared the second post
#' - timestamp_share: The timestamp when the first post was shared
#' - timestamp_share_y: The timestamp when the second post was shared
#' - similarity_score: The similarity score between the two posts
#' - time_delta: The (absolute) time difference between the two posts
#'
#' @import textreuse
#'
#' @export

detect_similar_text <- function(x,
                                min_participation = 2,
                                time_window = 10,
                                min_similarity = 0.8,
                                similarity_function = textreuse::jaccard_similarity,
                                tokenizer = textreuse::tokenize_ngrams,
                                minhash_seed = NULL,
                                minhash_n = 200,
                                skip_short=FALSE) {
    # Bind the column names used in data.table expressions to NULL so that
    # R CMD check does not flag them as undefined global variables.
    a <- b <- score <- content_id <- account_id <-
        timestamp_share <- similarity_score <-
        time_delta <- timestamp_share_y <-
        content_id_y <- account_id_y <- NULL

    # Check arguments

    stopifnot(is.data.table(x))
    stopifnot(min_participation >= 1)
    stopifnot(time_window >= 0)
    stopifnot(min_similarity >= 0 && min_similarity <= 1)
    stopifnot(is.function(similarity_function))
    stopifnot(is.function(tokenizer))
    if (!is.null(minhash_seed)) {
        stopifnot(is.numeric(minhash_seed))
        stopifnot(length(minhash_seed) == 1)
    }
    if (!is.null(minhash_n)) {
        stopifnot(is.numeric(minhash_n))
        stopifnot(length(minhash_n) == 1)
        stopifnot(minhash_n >= 1)
    }

    if ("id_user" %in% colnames(x)) {
        data.table::setnames(x, "id_user", "account_id")
        warning("Your data contained the column `id_user`, this name is deprecated, renamed it to `account_id`")
    }

    # Fail early (after the legacy rename above) when a required column is
    # missing, instead of erroring somewhere inside textreuse or the joins.
    required_cols <- c("content_id", "object_id", "account_id", "timestamp_share")
    missing_cols <- setdiff(required_cols, colnames(x))
    if (length(missing_cols) > 0) {
        stop(
            "`x` is missing required column(s): ",
            paste(missing_cols, collapse = ", "),
            call. = FALSE
        )
    }

    # https://cran.r-project.org/web/packages/textreuse/vignettes/textreuse-minhash.html
    minhash <- textreuse::minhash_generator(n = minhash_n, seed = minhash_seed)

    # Named character vector: names are the content IDs, values are the texts
    texts <- x$object_id
    names(texts) <- x$content_id

    # Create a corpus object with the TextReuse package
    # Caveats:
    # - minimum length of n-grams cannot be specified
    # due to a bug in textreuse (see: https://github.com/ropensci/textreuse/pull/80)
    corpus <- textreuse::TextReuseCorpus(
        text = texts,
        tokenizer = tokenizer,
        minhash_func = minhash,
        keep_tokens = TRUE,
        progress = TRUE,
        skip_short = skip_short
    )

    # NOTE(review): textreuse::lsh appears to require that the number of
    # minhashes is evenly divisible by `bands`; the default minhash_n = 200 is
    # not divisible by 80 -- TODO confirm and align the defaults.
    buckets <- textreuse::lsh(corpus, bands = 80, progress = TRUE)
    candidates <- textreuse::lsh_candidates(buckets)

    # One row per candidate pair: columns `a`, `b` (content IDs) and `score`
    result <- data.table(
        textreuse::lsh_compare(
            candidates,
            corpus,
            similarity_function,
            progress = TRUE
        )
    )

    # Split each pair into its two sides so that the post metadata can be
    # looked up in `x` for each side separately.
    result_a <- result[, .(a, score)]
    result_b <- result[, .(b, score)]

    setnames(result_a, "a", "content_id")
    setnames(result_b, "b", "content_id")

    cotweet_pairs_x <- x[result_a,
        .(content_id, account_id, timestamp_share, score),
        on = "content_id"
    ]
    cotweet_pairs_y <- x[result_b,
        .(content_id, account_id, timestamp_share),
        on = "content_id"
    ]

    setnames(
        cotweet_pairs_y,
        c("content_id", "account_id", "timestamp_share"),
        c("content_id_y", "account_id_y", "timestamp_share_y")
    )

    # Both lookups follow the row order of `result`, so the two sides line up
    cotweet_pairs <- cbind(cotweet_pairs_x, cotweet_pairs_y)

    setnames(cotweet_pairs, "score", "similarity_score")

    # filter by document similarity and time_window
    # (time_delta is only computed for sufficiently similar pairs; all other
    # rows get NA and are dropped by the second subset)
    coordinated_cotweets <- cotweet_pairs[
        similarity_score >= min_similarity,
        time_delta := timestamp_share - timestamp_share_y
    ][abs(time_delta) <= time_window]

    # filter by minimum participation
    coordinated_cotweets <-
        filter_min_participation(x, coordinated_cotweets, min_participation)

    # filter out loops

    coordinated_cotweets <- do_remove_loops(coordinated_cotweets)

    # ---------------------------
    # Sort output: content_id should be older than content_id_y
    # Therefore, we swap all values with positive time_delta
    # and return the absolute value.
    # The timestamps are swapped together with the IDs so that
    # `timestamp_share` keeps describing `content_id` after the swap.

    coordinated_cotweets[
        time_delta > 0,
        c(
            "content_id", "content_id_y",
            "account_id", "account_id_y",
            "timestamp_share", "timestamp_share_y"
        ) :=
            .(
                content_id_y, content_id,
                account_id_y, account_id,
                timestamp_share_y, timestamp_share
            )
    ]
    coordinated_cotweets[, time_delta := abs(time_delta)]

    return(coordinated_cotweets)
}
}
68 changes: 0 additions & 68 deletions man/detect_similar_text.Rd

This file was deleted.

0 comments on commit 7153a36

Please sign in to comment.