# frameToD3.R

#############################################################################
# frameToD3.R
#
# Outputs JSON file of topic model data for interactive visualizations.
# Provides two functions:
#       * frameToJSON(): given a topic model as generated by
#         'r2mallet with foreach.R', returns a hierarchical clustering of
#         topics in JSON. For each topic, includes the following metadata:
#         name, size, scaledsize, topwords, topic, rank.
#       * cotopic_edges(): given a topic model as generated by `r2mallet with
#         foreach.R`, returns weighted edges between topics and the same
#         hierarchical clustering as above.
#
# Forked from Rolf Fredheim at
# https://github.com/benmiller314/frameToD3/blob/master/frameToD3.r as
# discussed in
# http://quantifyingmemory.blogspot.com/2013/11/d3-without-javascript.html
# Many references to "my" below are his; I'll try to mark mine with "Ben:" as
# needed.
#####

source("build_plot_title.R")

# frameToJSON(): hierarchically cluster the topics of a topic model and
# express the cluster tree, plus per-topic metadata, as nested JSON for d3.
#
# Parameters are documented inline beside each argument below.
#
# Besides its arguments, the function reads these variables from the
# calling (normally global) environment:
#   webloc      directory for the JSON output (only used if outfile is NULL)
#   imageloc    directory for PDFs, the top-titles CSV and the cluster CSV
#   remake_figs if TRUE, figures, the cluster CSV and the JSON are written
#   autorun     if defined and TRUE, a preview of the nested list is printed
#
# Returns the data.table of topics (one row per topic: memb* cluster
# assignments, label, topic, topwords, itfwords, rank, titles, size,
# scaledsize), sorted by dendrogram order. Exception: when do.plot is TRUE
# and no cut levels are known for this model, the dendrogram is plotted and
# the hclust object is returned instead, so cuts can be explored by hand.
frameToJSON <- function(dataset_name="noexcludes2001_2015",
                        ntopics=50,
                        subset_name="knownprograms2001_2015",  # Ben: set NULL if not using.
                        iter_index=1,   # Ben: suffix to differentiate repeat runs of same MALLET params.
                        newnames=F,     # Ben: where in the MALLET output filename does iter_index appear?
                                        # Set T if it's with the model, F if last in filename.
                                        # Gets passed into get.doctopic.grid.
                        do.plot=TRUE,   # Ben: Use this the first time to
                                        # find good cuts in the dendrogram.
                        clust.method = c("diana", "agnes"),
                                        # use agglomerative (cluster::agnes)
                                        # or divisive (cluster::diana) clustering?
                        groupVars=NULL, # Ben: If not provided by the calling
                        dataVars=NULL,  # environment, these 3 parameters
                        outfile=NULL,   # will be set to defaults.
                        use.labels=FALSE, # replace topic numbers with labels
                                        # chosen using top_topic_browser()?
                        bad.topics= NULL,
                                        # exclude non-content-bearing topics
                        auto.lines=FALSE, # should we draw all the boxes on autorun?
                        tw = NULL,      # a topic-word matrix, if it exists
                        twm = NULL,     # the distance matrix for tw, if it exists
                        dt = NULL)      # the doc-topic grid, if it exists
{

  # packages we will need. library() stops immediately if one is missing;
  # require() would only warn and let a later call fail obscurely.
  library(data.table)
  library(jsonlite)
  library(cluster)

  # Resolve the clustering method once, so every later use agrees.
  clust.method <- match.arg(clust.method)

  # Ben: Get topic weights for every document we have
  if (is.null(dt)) {
    if (!exists("get.doctopic.grid", mode="function")) {
      source(file="get doctopic grid.R")
    }
    dt <- get.doctopic.grid(dataset_name=dataset_name, ntopics=ntopics,
                            subset_name=subset_name, iter_index=iter_index,
                            newnames=newnames)$outputfile.dt
  }
  # Ben: Exclude any NA rows included accidentally by the index file
  dt <- na.omit(dt)

  # Ben: Optionally exclude non-content-bearing topics
  # if (is.null(bad.topics)) {
  #     if (dataset_name=="consorts" && ntopics==55 && iter_index=="") {
  #         bad.topics <- c("2", "4", "22", "24", "47", "50", "13")
  #     } else if (dataset_name=="noexcludes2001_2015" && ntopics==50 && iter_index==1) {
  #         bad.topics <- c("3", "8", "12", "15", "30", "34", "36",
  #                         # "47",
  #                         "50")
  #     }
  # }

  if (!is.null(bad.topics)) {
    dt <- dt[, setdiff(names(dt), bad.topics), with=FALSE]
  }

  # Set parameter defaults if needed
  if (is.null(groupVars)) {
    groupVars <- c("Pub.number")    # Don't treat ID column as data
  }
  if (is.null(dataVars)) {
    # any column that's not an ID is a datapoint
    dataVars <- colnames(dt)[!colnames(dt) %in% groupVars]
  }
  if (is.null(outfile)) {
    # the desired location of the JSON file produced by the function
    outfile_slug <- build_plot_title(dataset_name=dataset_name, ntopics=ntopics,
                                     iter_index=iter_index, subset_name=subset_name,
                                     bad.topics=bad.topics,
                                     whatitis="radial_clusters_data_tw_jsd")
    outfile <- file.path(webloc, paste0(outfile_slug, ", ", clust.method, ".json"))
  }

  # Rolf: calculate the correlation matrix
  # Ben: We'll use topic-word vectors, instead of topic-document
  if (is.null(twm)) {
    if (!exists("topic_distance_matrix", mode="function")) {
      source(file="topic_term_synonyms.R")
      source(file="topic_term_synonyms.R") # run twice to get WNHOME
    }

    topic_dist <- topic_distance_matrix(dataset_name = dataset_name,
                                        ntopics = ntopics,
                                        iter_index = iter_index,
                                        dist_method = "jensen-shannon",
                                        tw = tw,
                                        bad.topics = bad.topics)
  } else {
    topic_dist <- twm    # back compatibility
  }

  # Rolf: calculate the hierarchical cluster structure
  # from the correlation scores
  # Ben: topic_clusters() is also from topic_term_synonyms.R
  # Ben: optionally use descriptive topic labels, if we have them
  clust <- topic_clusters(topic_dist, do.plot=FALSE, use.labels=use.labels,
                          clust.method=clust.method)

  # Ben: convert to hclust so we can interact with the plot more easily
  hc <- as.hclust(clust)

  # Facts about the requested model, computed once. identical()/isTRUE()
  # keep the tests length-one even when subset_name or iter_index is NULL
  # (a zero-length operand to && is an error in R >= 4.3).
  n_bad         <- length(bad.topics)    # 0 when bad.topics is NULL
  is_noexcludes <- identical(dataset_name, "noexcludes2001_2015")
  is_consorts   <- identical(dataset_name, "consorts")
  no_subset     <- is.null(subset_name)
  first_iter    <- isTRUE(iter_index == 1)

  # Ben: I'm making this section optional,
  # because it makes the most sense early on and has diminishing returns.
  if (do.plot) {
    # Rolf: take a look at your structure.
    # Ben: Try various cut levels until you find a set that seems
    # interesting; then adjust the split rules further below accordingly.

    main <- build_plot_title(dataset_name=dataset_name, ntopics=ntopics, iter_index=iter_index,
                             subset_name=subset_name, bad.topics=bad.topics, use.labels=use.labels)

    # Plot the dendrogram, to a PDF in imageloc when remake_figs is TRUE.
    # When auto.lines is TRUE, box the clusters for each k[j] in colour
    # border[j] and rule a horizontal line at height h[j] (NA = no line).
    # Extra arguments (e.g. cex) are passed on to plot().
    plot_dendrogram <- function(k = NULL, h = NULL, border = NULL,
                                suffix = ".pdf", ...) {
      if (remake_figs) {
        pdf(file=file.path(imageloc, paste0(main, suffix)))
        on.exit(dev.off(), add = TRUE)   # close the device even on error
      }
      plot(hc, main=main, ...)
      if (auto.lines) {
        for (j in seq_along(k)) {
          # rect.hclust() only accepts k up to the number of merges
          if (k[j] > length(hc$height)) {
            warning("frameToJSON(): cannot box k=", k[j], " clusters; the tree",
                    " allows at most ", length(hc$height), call. = FALSE)
            next
          }
          rect.hclust(hc, k=k[j], border=border[j])
          if (!is.na(h[j])) { abline(h[j], 0, col=border[j]) }
        }
      }
      invisible(NULL)
    }

    # restarting with topic-word-based clusters; see topic_term_synonyms.R
    # (these two figures are drawn in addition to the one chosen below)
    if (is_noexcludes && ntopics == 50 && n_bad == 9) {
      if (inherits(clust, "agnes")) {
        # cuts explored here: k = 2, 4, 7, 12
        plot_dendrogram(k      = c(2, 4, 7, 12),
                        h      = c(0.91, 0.7, 0.63, 0.55),
                        border = c("#999900", "#FF9999", "#009900", "#9999FF"),
                        suffix = ", jsd.pdf")
      }
      # what about divisive clustering?
      if (inherits(clust, "diana")) {
        # cuts explored here: k = 2, 3, 4, 9, 11, 21
        plot_dendrogram(k      = c(2, 3, 4, 9, 11, 21),
                        h      = c(0.615, 0.605, 0.595, 0.545, 0.55, NA),
                        border = c("#999900", "#FF9999", "#009999",
                                   "#009900", "#9999FF", "#99FFFF"),
                        suffix = ", diana, jsd.pdf")
      }
      # NOTE(review): the agnes/diana cut sets above were never the ones
      # used for the JSON -- the 9-bad-topics rule in the splits section
      # below always took precedence. Confirm which set is wanted.
    }

    if (is_consorts && ntopics == 55 && n_bad == 5) {
      # with 5 bad.topics removed
      plot_dendrogram(k      = c(32, 16, 12, 7, 6, 4, 2),
                      h      = c(1.35, 1.55, 1.7, 1.85, 1.95, 2.33, 3.37),
                      border = c("#99FF99", "#009900", "#FF9999", "#9999FF",
                                 "#990099", "#009999", "#999900"))

    } else if (is_consorts && identical(subset_name, "realconsorts") &&
               ntopics == 55 && n_bad == 7) {
      # with 7 bad.topics removed and the real consortium-program subset
      plot_dendrogram(k      = c(2, 4, 6, 9, 11, 19),
                      h      = c(NA, NA, 1.931, 1.79, 1.7, 1.54),
                      border = c("#99FF99", "#999900", "#FF9999",
                                 "#009999", "#990099", "#009900"))

    } else if (is_consorts && ntopics == 55 && n_bad == 7) {
      # with 7 bad.topics removed but the full consorts set
      plot_dendrogram(k      = c(21, 11, 6, 4, 2),
                      h      = c(1.45, 1.73, 1.955, NA, NA),
                      border = c("#99FF99", "#009900", "#FF9999",
                                 "#009999", "#999900"))

    } else if (is_noexcludes && no_subset && ntopics == 150) {
      if(FALSE) {       # for screenshots.
        subplot = 23  # for subplot, use a number of topics from below.
                      # start the pdf device, using a filename indicating lines or boxes
        pdf(file=file.path(imageloc, paste0(main, "--", subplot, " subtopics--line.pdf")))
        # pdf(file=file.path(imageloc, paste0(main, "--", subplot, " subtopics--boxes.pdf")))
        plot(hc, main=main, sub=paste(subplot, "topic clusters"), cex=0.3, xlab="")
                      # **insert/run the relevant lines from below here**
                      # then come back up to turn the pdf device off
        dev.off()
      }

      plot_dendrogram(k      = c(2, 3, 8, 15, 23, 50),
                      h      = c(4.1, 3.14, 2.34, 2, 1.81, 1.5),
                      border = c("#99FF99",    # seafoam
                                 "#009900",    # green
                                 "#990099",    # purple
                                 "#009999",    # teal
                                 "#FF0099",    # pink
                                 "#000099"),   # dark blue
                      cex = 0.3)

    } else if (is_noexcludes && no_subset && ntopics == 23) {
      # NOTE(review): k=23 and k=50 cannot be boxed on a 23-topic tree (they
      # look copied from the 150-topic model); plot_dendrogram() skips them
      # with a warning. No splits are defined for this model yet either.
      plot_dendrogram(k      = c(15, 23, 50),
                      h      = c(2, 1.81, 1.5),
                      border = c("#99FF99",    # seafoam
                                 "#009900",    # green
                                 "#009999"),   # teal
                      cex = 0.6)

    } else if (is_noexcludes && no_subset && ntopics == 50 && first_iter) {
      # use the following for CCCC 2019
      plot_dendrogram(k      = c(2, 4, 6, 8, 12, 16, 20),
                      h      = c(2.95, 2.17, 1.92, 1.65, 1.51, 1.46, 1.42),
                      border = c("#99FF99",    # seafoam
                                 "#009900",    # green
                                 "#990099",    # purple
                                 "#009999",    # teal
                                 "#000099",    # dark blue
                                 "#FF0099",    # pink
                                 "#999900"))   # yellow

    } else if (is_noexcludes && ntopics == 50 && first_iter &&
               (identical(subset_name, "realconsorts2001_2015") ||
                identical(subset_name, "knownprograms2001_2015"))) {
      # both subsets share the same cut levels
      plot_dendrogram(k      = c(2, 3, 7, 9, 14, 20, 24),
                      h      = c(2.88, 2.31, 1.91, 1.69, 1.52, 1.43, 1.37),
                      border = c("#99FF99",    # seafoam
                                 "#009900",    # green
                                 "#990099",    # purple
                                 "#009999",    # teal
                                 "#000099",    # dark blue
                                 "#FF0099",    # pink
                                 "#999900"))   # yellow

    } else if (is_noexcludes && no_subset && ntopics == 60) {
      plot_dendrogram(k      = c(2, 5, 7, 12, 15, 22, 24, 40),
                      h      = c(2.95, 2.165, 1.885, 1.735, 1.65, 1.53, 1.48, 1.33),
                      border = c("#99FF99",    # seafoam
                                 "#009900",    # green
                                 "#990099",    # purple
                                 "#009999",    # teal
                                 "#000099",    # dark blue
                                 "#FF0099",    # pink
                                 "#999900",    # yellow
                                 "#00FF99"))   # sky blue

    } else {
      plot_dendrogram()

      # If we're plotting, we probably wanted to locate splits.
      # Exit the function here.
      message("Exiting function.")
      message("Using abline() and rect.hclust(), try various cut levels
                    until you find a set that seems promising.")
      return(hc) ## TO DO: instead of exiting, can we pause somehow but preserve the environment?
                 ## We still need hc to do the cuts.
    }
  }   # end of if(do.plot)

  # Rolf: now we split the data based on membership structure (basically
  # this means we will calculate which group each variable belongs in for
  # different levels of the tree structure)
  #
  ## Ben: so, essentially, we're going to look at plot(hc) and decide what
  ## the major branch points are, then cut the tree to find group assignments
  ## above/below those splits. NB cutree() also allows us to split the tree
  ## at specific heights (on the y axis of that plot), if we don't want to
  ## count the groups.
  #
  # The rules below are applied in order and a later match overrides an
  # earlier one. They are evaluated whether or not do.plot is set, so the
  # same model always yields the same splits.
  splits <- NULL

  # realconsorts / knownprograms subsets of the 50-topic model
  if (is_noexcludes && ntopics == 50 && first_iter &&
      (identical(subset_name, "realconsorts2001_2015") ||
       identical(subset_name, "knownprograms2001_2015"))) {
    splits <- c(2, 3, 7, 9, 14, 20, 24)
  }

  # 50-topic model with exactly 9 bad.topics removed
  if (is_noexcludes && ntopics == 50 && n_bad == 9) {
    splits <- c(2, 4, 6, 8, 12, 16, 20)
  }

  # Ben: splits for consorts with 55 topics (i.e. including bad.topics)
  if (is_consorts && ntopics == 55 && is.null(bad.topics)) {
    splits <- c(2, 5, 10, 22, 55)
  }

  # Ben: splits for consorts with 50 topics
  # (i.e. 5 bad.topics removed for bad OCR or boilerplate)
  if (is_consorts && ntopics == 55 && n_bad == 5) {
    splits <- c(2, 4, 6, 7, 12, 16, 32)
  }

  # Ben: splits for realconsorts subset with 48 topics
  # (i.e. 7 bad.topics removed for bad OCR, boilerplate, or non-English lang)
  if (is_consorts && identical(subset_name, "realconsorts") &&
      ntopics == 55 && n_bad == 7) {
    splits <- c(2, 4, 6, 9, 11, 19)
  }

  # Ben: splits for consorts with 48 topics (i.e. 7 bad.topics removed
  # for bad OCR, boilerplate, or non-English lang)
  if (is_consorts && no_subset && ntopics == 55 && n_bad == 7) {
    splits <- c(2, 4, 6, 11, 21)
  }

  # Splits for model with 150 topics
  if (is_noexcludes && no_subset && ntopics == 150 && is.null(bad.topics)) {
    splits <- c(2, 3, 8, 15, 23, 50)
  }

  if (is_noexcludes && no_subset && ntopics == 60 && is.null(bad.topics)) {
    splits <- c(2, 5, 7, 12, 15, 22, 24, 40)
  }

  # use the following for CCCC 2019
  if (is_noexcludes && no_subset && ntopics == 50 && first_iter) {
    splits <- c(2, 4, 6, 8, 12, 16, 20)
  }

  if (is.null(splits)) {
    stop("frameToJSON(): no cluster splits are defined for dataset '",
         dataset_name, "' with ", ntopics, " topics, ",
         if (no_subset) "no subset" else paste0("subset '", subset_name, "'"),
         " and ", n_bad, " bad topic(s). Inspect the dendrogram (do.plot=TRUE)",
         " and add a rule for this model.", call. = FALSE)
  }

  # Group membership of every topic at each cut level: one character
  # column per split, named memb<k>.
  # get_cluster_names (not used here) is from topic_term_synonyms.R
  membVars <- paste0("memb", splits)
  memberships <- vapply(splits,
                        function(k) as.character(cutree(hc, k = k)),
                        character(length(hc$order)))
  colnames(memberships) <- membVars

  # Ben: get topic labels, which you've composed elsewhere using
  # 'top docs per topic.R'
  if (!exists("get_topic_labels", mode="function")) {
    source(file="get topic labels.R")
  }
  topic.labels.dt <- get_topic_labels(dataset_name=dataset_name, ntopics=ntopics,
                                      subset_name=subset_name, iter_index=iter_index)

  # Ben: Update the vocabulary for those top-lists
  if (!exists("tfidf.for.topics", mode="function")) {
    source(file="tfidf_for_topics.R")
  }
  # NOTE(review): these assignments create a function-local remake_figs, so
  # they cannot change what tfidf.for.topics() sees if it reads the global
  # variable; presumably the intent was to stop it writing figures -- confirm.
  status_quo <- remake_figs
  remake_figs <- FALSE
  topwords <- tfidf.for.topics(tw=tw)$topN
  remake_figs <- status_quo
  rm(status_quo)

  # Ben: Also add top dissertation titles
  filename <- file.path(imageloc, paste0("top_titles_per_topic-", dataset_name, "k", ntopics, subset_name, iter_index, ".csv"))

  if (!file.exists(filename)) {
    # if there isn't yet a file with top titles for each topic, create it now
    if (!exists("find_topic_titles", mode="function")) {
      source(file="find topic titles.R")
    }
    titles_all <- find_topic_titles(dataset_name=dataset_name, ntopics=ntopics,
                                    subset_name=subset_name, iter_index=iter_index)
  } else {
    # if the file does exist, just load it now
    titles_all <- read.csv(filename)
  }

  # exclude non-content-bearing topics
  if (!is.null(bad.topics)) {
    topic.labels.dt <- topic.labels.dt[!Topic %in% bad.topics]
    titles_all <- titles_all[!titles_all$topic %in% bad.topics,]
    topwords <- topwords[!(topic %in% bad.topics),]
  }

  # Rolf: Now put this information into a table, together with the labels and
  # the order in which they should appear:
  # Ben adds: use gsub to remove spaces (this seems to help the d3
  # scrollover); add topic number to aid in merging w/ edge table later
  b <- data.table(memberships,
                  label = gsub(' ', '_', topic.labels.dt[, Label]),
                  topic = topic.labels.dt[, Topic],
                  topwords = topwords$by_prob,
                  itfwords = topwords$by_tfitf,
                  rank = topic.labels.dt[, Rank],
                  order = hc$order,
                  titles = titles_all[, "top_titles"]
                  )

  # Rolf: We might want to know the size of each node. Let's add that.
  # Ben: for a topic model, this will find the total %-point contribution of
  # the topic to all docs; that means we could divide by number of docs to
  # scale to [0,1], but no need: it's proportional.
  b$size <- colSums(dt[, c(dataVars), with=FALSE])
  b$scaledsize <- b$size/nrow(dt)

  # Rolf: sort the data so it aligns with the structure calculated using
  # hclust(), then drop the order variable
  setkey(b, order)
  b[, order := NULL]

  # Ben: Save this data table to a csv for later inspection; this table will
  # also be returned by the function.
  if (remake_figs) {
    subset_part <- if (!is.null(subset_name)) paste0("--", subset_name) else ""
    filename <- file.path(imageloc, paste0("topic clusters - ", dataset_name,
                          "k", ntopics, "_", iter_index, subset_part, ", ",
                          length(bad.topics), " bad topics removed.csv"))
    write.csv(b, filename)
  }

  ## Hierarchical Clustering of Topics by Similarity
  # Rolf: we define a function which will create a nested list in JSON format:
  # From here:
  # http://stackoverflow.com/questions/12818864/how-to-write-to-json-with-
  # children-from-r
  # Ben: but see also, now, http://bit.ly/1jXAC5M
  #
  # makeList() recurses over the leading memb* columns of x: it splits the
  # rows by the first column, names each group "<value>of<k>", and descends
  # into the remaining columns. Once no memb* column is left it emits one
  # leaf per row carrying the topic metadata.
  makeList <- function(x) {
    if (any(names(x) %in% membVars) && ncol(x) > 2) {
      listSplit <- split(x[-1], x[1], drop=TRUE)
      grp <- names(x)[1]
      grpnum <- substr(grp, 5, nchar(grp))    # strip the "memb" prefix
      names(listSplit) <- paste0(names(listSplit), "of", grpnum)
      lapply(names(listSplit), function(y) {
        list(name=y, children=makeList(listSplit[[y]]))
      })
    } else {
      lapply(seq_len(nrow(x)), function(y) {
        list(name=x[,"label"][y],
             size=x[,"size"][y],
             scaledsize=x[,"scaledsize"][y],
             topwords=x[,"topwords"][y],
             itfwords=x[,"itfwords"][y],
             topic=x[,"topic"][y],
             rank=x[,"rank"][y],
             titles=x[,"titles"][y])
      })
    } # end of if-else
  } # end of makeList

  # Rolf: This will not work on a data.table
  b.df <- data.frame(b)
  out <- makeList(b.df)

  # Have a look at the structure this creates (only if the session defines
  # an `autorun` flag and it is set):
  if (exists("autorun") && autorun) { print(head(out)) }

  # Rolf: Basically we have made a list of lists containing the information
  # from the tree diagram. Finally we put everything into a list, convert this
  # to json format and save it as data.json
  jsonOut <- toJSON(list(name="1of1", children=out), digits=6, pretty=TRUE)

  # Rolf: We use the cat function here, because in some cases you may want to
  # add separators, or a prefix and suffix to make the formatting just right
  # Ben adds: to avoid overwriting, only save this file if remake_figs is
  # TRUE
  if (remake_figs) {
    cat(jsonOut, file=outfile)
  }

  # Ben: Return the data.table for use in edge bundling, below
  return(b)
}

########
# Hierarchical Edge Bundling between (possibly unrelated) topics. This one's
#  all Ben, but trying to reconstruct a figure like Rolf's
#  http://fredheir.github.io/dendroArcs/pages/hierarc/page.html.
#
#  Plan: From the combined hierarchical data structure above (named `b`), for
#  each topic (row):
#     1) pull out the "name" field that combines location in hierarchy with
#         label information
#     2) loop through the targets, and find the "name" corresponding to
#         that target
#     3) convert to JSON.
########

# cotopic_edges(): build the JSON for a hierarchical edge-bundling figure.
# Combines the co-occurrence edges from get.cotopics() with the clustered
# topic table from frameToJSON(), giving each topic a dotted "name" that
# encodes its position in the hierarchy plus the names of the topics it
# links to ("imports") and the matching weights.
#
# Parameters are documented inline beside each argument below.
#
# Reads `webloc`, `sourceloc` and `remake_figs` from the calling (normally
# global) environment. The JSON is written to `outfile` only when
# remake_figs is TRUE, and an existing file is overwritten only after
# confirmation at the console.
#
# Returns the JSON text, invisibly.
cotopic_edges <- function(dataset_name="noexcludes2001_2015",
              ntopics=50,
              subset_name="knownprograms2001_2015",
              iter_index=1,
              level=0.13,       # topic must constitute how much of each doc?
              min=3,            # how many times must a pair of topics co-occur?
              outfile=NULL,     # if null, defaults to naming the parameters
              cull.bad.topics = TRUE, # exclude non-content-bearing topics?
              bad.topics= NULL, # if so, which?
              clust.method=c("diana","agnes"),
              tw=NULL    # topic-word grid, if we have one, makes this much faster
              )
{

    # Resolve the clustering method once; the same value names the output
    # file and is handed to frameToJSON().
    clust.method <- match.arg(clust.method)

    ## set default parameters if needed

    if (cull.bad.topics) {
        if (is.null(bad.topics)) {
            if (dataset_name=="noexcludes2001_2015" && ntopics==50 && iter_index==1) {
                bad.topics <- c("3", "8", "12", "15", "30", "34", "36", "47", "50")
            } else {
                warning("cotopic_edges(): can't cull bad.topics if they're unknown")
            }
        }
    } else {
        # NOTE: this only warns; any bad.topics supplied are still passed on
        # to get.cotopics() and frameToJSON() below.
        if (!is.null(bad.topics)) {
            warning("cotopic_edges(): bad.topics specified, but cull.bad.topics is False.",
                    " If you want to include all topics, set bad.topics <- c()")
        }
    }

    # the desired location of the JSON file produced by the function
    if (is.null(outfile)) {
        if (!exists("build_plot_title", mode="function")) {
          source(file="build_plot_title.R")
        }
        outfile_slug <- build_plot_title(dataset_name = dataset_name,
                                         ntopics = ntopics,
                                         iter_index = iter_index,
                                         subset_name = subset_name,
                                         bad.topics = bad.topics,
                                         whatitis = "edge_data",
                                         for.filename = TRUE)
        outfile <- file.path(webloc,
                             paste0(outfile_slug,
                             "--min", min,
                             "--", level*100, "pct",
                             "--", clust.method, ".json"))
    }


    # get co-occurring topics, for hierarchical edge bundling
    if (!exists("get.cotopics")) { source(file.path(sourceloc, "cotopics.R")) }
    cotopics <- get.cotopics(dataset_name=dataset_name, ntopics=ntopics,
                             subset_name=subset_name, iter_index=iter_index,
                             level=level, min=min, bad.topics=bad.topics)

    # that gives one-directional links; to ensure symmetry, flip source and
    # target and combine.
    cotopics_flip <- data.table( source=cotopics$target,
                                 target=cotopics$source,
                                 weight=cotopics$weight)
    cotopics_both <- rbind(cotopics, cotopics_flip)

    # aggregate all edges by source: one row per source topic, with its
    # targets and weights collapsed into comma-separated strings
    edges <- cotopics_both[, .SD[, list(
                                    "targets"=paste(target, collapse=","),
                                    "weights"=paste(weight, collapse=","))
                                ], by=source]
    setkey(edges, source)

    # TO DO: allow for directed edges, where a source node means
    # the topic is top-ranked in its diss.
    # See get_top_topics() in 'variation of topic proportions.R'.

    # Bring in the node table
    b <- frameToJSON(dataset_name, ntopics, subset_name, iter_index,
                     bad.topics=bad.topics, do.plot=FALSE, tw=tw,
                     clust.method=clust.method)
    setkey(b, topic)

    # merge: keep every topic in b; topics with no edges get NA
    # targets/weights. The join column is called `source` in the result.
    b <- edges[b, ]

    # Create a "name" column that collapses the hierarchical structure and
    # topic label, as per
    # http://fredheir.github.io/dendroArcs/pages/hierarc/test.JSON This is
    # what the d3 edge bundling code in packages.js will parse to recreate
    # the hierarchy

    # first re-derive `membVars` from the names of b that include "memb"
    membVars <- names(b)[grep("memb", names(b))]

    b$name <- vapply(seq_len(nrow(b)), FUN=function(x) {
                    paste(b[x, c(membVars, "label"), with=FALSE], collapse=".")
                    }, FUN.VALUE=character(1))

    # We're going to build our JSON for edge bundling with a name, size, and
    # (to take advantage of Mike Bostock's
    # http://mbostock.github.io/d3/talk/20111116/packages.js)
    # we'll call the edges "imports"

    # We start empty (one placeholder row per topic number up to the max)...
    edge_bund <- data.table(name=rep("NA", max(b$source)),
                            topic=0,
                            size=0.0,
                            imports=list("NA")
                            )

    # ... and then build up; topic number i is stored in row i
    for (i in b$source) {
        edge_bund[i, "topic"] <- i
        edge_bund[i, "name"] <- b[source %in% i, name]
        edge_bund[i, "rank"] <- b[source %in% i, rank]
        edge_bund[i, "size"] <- b[source %in% i, size]
        edge_bund[i, "scaledsize"] <- b[source %in% i, scaledsize] * 100
        edge_bund[i, "topwords"] <- b[source %in% i, topwords]
        edge_bund[i, "itfwords"] <- b[source %in% i, itfwords]
        edge_bund[i, "titles"] <- b[source %in% i, titles]

        # extract targets' topic numbers and look up their names
        import_names <- lapply(strsplit(b[source %in% i, targets], ","),
            FUN=function(x) {
                x <- as.integer(x)          # convert from string to numeric
                b[source %in% x, name]      # match topic numbers to sources
            })

        # weights correspond by position in array
        # NOTE(review): as.integer() truncates; fine if weights are counts.
        edge_weights <- lapply(strsplit(b[source %in% i, weights], ","),
            FUN=function(x) {
                x <- as.integer(x)
            })

        # A topic with no edges has NA targets, and the name lookup for NA
        # comes back *empty* rather than NA -- so test for emptiness as
        # well, otherwise the self-loop fallback can never be reached.
        has_edges <- length(import_names[[1]]) > 0 && !anyNA(import_names[[1]])

        if (has_edges) {                    # list edges if there are any
            edge_bund[i, "imports"] <- list(import_names)
            edge_bund[i, "weights"] <- list(edge_weights)
        } else {                            # otherwise...
            # ... give it a loop back to itself
            edge_bund[i, "imports"] <- list(b[source %in% i, name])

            # and call the weight "1"
            edge_bund[i, "weights"] <- list(1)
        }
    }

    # Now remove any empty rows introduced by cutting bad.topics
    edge_bund <- edge_bund[!(name %in% "NA")]

    jsonEdge <- toJSON(edge_bund, pretty=TRUE)

    if (remake_figs) {
      if (file.exists(outfile)) {
        message("File already exists:\n ", outfile, "\n")
        a <- readline("Overwrite file? (y/n)   ")
        if (startsWith(tolower(a), "y")) {
          cat(jsonEdge, file=outfile)
          message("File saved to ", outfile)
        } else {
          message("File not saved, but returning edge_data json to stdout.")
        }
      } else {
        cat(jsonEdge, file=outfile)
      }
    }

    invisible(jsonEdge)
}


# Manual usage examples. Everything inside `if (FALSE)` is skipped when this
# file is sourced; run individual calls by hand from an interactive session.
# Calls that pass `tw = tw` need a `tw` object already in the workspace (see
# get_topic_word_grid.R). `remake_figs` is the flag checked before any JSON
# file is written.
if (FALSE) {
    remake_figs
    # debug(frameToJSON)

    # --- frameToJSON() examples ---------------------------------------------
    frameToJSON(do.plot = TRUE)
    frameToJSON(dataset_name = "noexcludes",
                subset_name = "realconsorts",
                iter_index = "")
    frameToJSON(ntopics = 150, bad.topics = NULL)
    frameToJSON(do.plot = TRUE,
                dataset_name = "noexcludes2001_2015",
                subset_name = NULL,
                ntopics = 150,
                iter_index = 6,
                bad.topics = NULL)
    frameToJSON(do.plot = TRUE,
                dataset_name = "noexcludes2001_2015",
                subset_name = NULL,
                ntopics = 150,
                iter_index = 1,
                bad.topics = NULL)
    frameToJSON(do.plot = TRUE,
                dataset_name = "noexcludes2001_2015",
                subset_name = NULL,
                ntopics = 60,
                iter_index = 4,
                bad.topics = NULL)
    frameToJSON(do.plot = TRUE,
                dataset_name = "noexcludes2001_2015",
                subset_name = "consorts2001_2015",
                ntopics = 60,
                iter_index = 4,
                bad.topics = NULL)
    frameToJSON(do.plot = TRUE,
                dataset_name = "noexcludes2001_2015",
                subset_name = "realconsorts2001_2015",
                ntopics = 60,
                iter_index = 4,
                bad.topics = NULL)
    frameToJSON(do.plot = FALSE,
                dataset_name = "noexcludes2001_2015",
                subset_name = NULL,
                ntopics = 50,
                iter_index = 1,
                bad.topics = c(3, 12, 50, 47, 34, 36, 30, 8, 15))
    frameToJSON(do.plot = TRUE,
                dataset_name = "noexcludes2001_2015",
                subset_name = "realconsorts2001_2015",
                ntopics = 50,
                iter_index = 1,
                bad.topics = c(3, 12, 50, 47, 34, 36, 30, 8, 15),
                use.labels = TRUE)
    frameToJSON(do.plot = FALSE,
                dataset_name = "noexcludes2001_2015",
                subset_name = "knownprograms2001_2015",
                ntopics = 50,
                iter_index = 1,
                bad.topics = c(3, 12, 50, 47, 34, 36, 30, 8, 15),
                use.labels = TRUE,
                tw = tw)      # see get_topic_word_grid.R

    # Same model, but keeping topic 47 and clustering with agnes.
    frameToJSON(do.plot = FALSE,
                dataset_name = "noexcludes2001_2015",
                subset_name = "knownprograms2001_2015",
                ntopics = 50,
                iter_index = 1,
                bad.topics = c(3, 12, 50,
                               # 47,
                               34, 36, 30, 8, 15),
                use.labels = TRUE,
                tw = tw,      # see get_topic_word_grid.R
                clust.method = "agnes")


    # --- cotopic_edges() examples -------------------------------------------

    # 12% determined by `variation of topic proportions.R` to include nearly
    # all primary topics and 3/4 of secondary topics for *consorts*;
    # see `Variation of Topic Proportions, Top 10 Topics per Document.pdf`
    # NOTE(review): the first call below uses level = 0.05 on the
    # knownprograms subset, not the 12% / consorts setting described above.
    remake_figs <- TRUE
    edges <- cotopic_edges(level = 0.05,
                           min = 1,
                           dataset_name = "noexcludes2001_2015",
                           ntopics = 50,
                           iter_index = 1,
                           subset_name = "knownprograms2001_2015",
                           bad.topics = c(3, 12, 50,
                                          # 47,   # spanish language
                                          34, 36, 30, 8, 15),
                           tw = tw,
                           clust.method = "agnes")
    cotopic_edges(level = 0.12, min = 2)
    cotopic_edges(level = 0.12,
                  min = 3,
                  dataset_name = "noexcludes2001_2015",
                  ntopics = 50,
                  iter_index = 1,
                  bad.topics = c(3, 12, 50, 47, 34, 36, 30, 8, 15))
    cotopic_edges(level = 0.12, min = 4)
    cotopic_edges(level = 0.12, min = 5)

    # 11% determined by `variation of topic proportions.R` to include nearly
    # all primary topics and 3/4 of secondary topics for *realconsorts*;
    # see `Variation of Topic Proportions, Top 10 Topics per Document.pdf`
    # NOTE(review): this call passes the knownprograms subset, not
    # realconsorts -- confirm which one is intended.
    cotopic_edges(level = 0.11,
                  min = 1,
                  tw = tw,
                  dataset_name = "noexcludes2001_2015",
                  subset_name = "knownprograms2001_2015",
                  ntopics = 50,
                  iter_index = 1,
                  bad.topics = c(3, 12, 50, 47, 34, 36, 30, 8, 15))

    # 13% for *knownprograms2001_2015*:
    remake_figs <- TRUE
    cotopic_edges(tw = tw,
                  level = 0.13,
                  # level=0.05,
                  # level=0.22,
                  # min=3,
                  min = 12,
                  dataset_name = "noexcludes2001_2015",
                  subset_name = "knownprograms2001_2015",
                  ntopics = 50,
                  iter_index = 1,
                  bad.topics = c(3, 12, 50, 47, 34, 36, 30, 8, 15),
                  clust.method = "agnes")
    remake_figs <- FALSE
}