diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9a09c6f --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +cpg.bin +target/ +.idea/ +/.bsp +/joern-inst +/workspace +/results +/bin +.local diff --git a/.scalafmt.conf b/.scalafmt.conf new file mode 100644 index 0000000..e7cccd3 --- /dev/null +++ b/.scalafmt.conf @@ -0,0 +1,5 @@ +version = 3.5.1 +runner.dialect = scala3 +preset = IntelliJ +maxColumn = 120 +align.preset = true diff --git a/README.md b/README.md index 30af0ee..cc0c591 100644 --- a/README.md +++ b/README.md @@ -16,9 +16,7 @@ A benchmarking suite for Joern --version Prints the version benchmark The benchmark to run. Available [ALL,OWASP_JAVASRC,OWASP_JAVA,SECURIBENCH_MICRO_JAVASRC,SECURIBENCH_MICRO_JAVA] -d, --dataset-dir - The dataset directory where benchmarks will be initialized and executed. Default is `./workspace`. - -o, --output The output directory to write results to. Default is `./results`. - -f, --format The output format to write results as. Default is JSON. Available [JSON,CSV,MD] + The dataset directory where benchmarks will be downloaded to. Default is `./workspace`. 
``` ## Benchmarks diff --git a/build.sbt b/build.sbt new file mode 100644 index 0000000..736e37f --- /dev/null +++ b/build.sbt @@ -0,0 +1,52 @@ +name := "joern-benchmarks-datasets-datasets" +ThisBuild / organization := "io.joern" +ThisBuild / scalaVersion := "3.4.1" + +// parsed by project/Versions.scala, updated by updateDependencies.sh +val cpgVersion = "1.6.11" +val joernVersion = "2.0.348" +val overflowdbVersion = "1.192" + + +libraryDependencies ++= Seq( + "com.github.pathikrit" %% "better-files" % Versions.betterFiles, + "com.github.scopt" %% "scopt" % Versions.scopt, + "org.apache.logging.log4j" % "log4j-slf4j2-impl" % Versions.log4j % Optional, + "com.lihaoyi" %% "requests" % Versions.requests, + "com.lihaoyi" %% "upickle" % Versions.upickle, + "io.joern" %% "joern-cli" % Versions.joern, + "io.joern" %% "x2cpg" % Versions.joern +) + +// mostly so that `sbt assembly` works, but also to ensure that we don't end up +// with unexpected shadowing in jar hell +excludeDependencies ++= Seq(ExclusionRule("io.shiftleft", "codepropertygraph-domain-classes_3")) + +assembly / assemblyMergeStrategy := { + case "log4j2.xml" => MergeStrategy.first + case "module-info.class" => MergeStrategy.first + case "META-INF/versions/9/module-info.class" => MergeStrategy.first + case "io/github/retronym/java9rtexport/Export.class" => MergeStrategy.first + case PathList("scala", "collection", "internal", "pprint", _) => MergeStrategy.first + case x => + val oldStrategy = (ThisBuild / assemblyMergeStrategy).value + oldStrategy(x) +} + +ThisBuild / Compile / scalacOptions ++= Seq("-feature", "-deprecation", "-language:implicitConversions") + +enablePlugins(JavaAppPackaging) + +ThisBuild / licenses := List("Apache-2.0" -> url("http://www.apache.org/licenses/LICENSE-2.0")) + +Global / onChangedBuildSource := ReloadOnSourceChanges + +ThisBuild / resolvers ++= Seq( + Resolver.mavenLocal, + "Sonatype OSS" at "https://oss.sonatype.org/content/repositories/public", + "Atlassian" at 
"https://packages.atlassian.com/mvn/maven-atlassian-external", + "Gradle Releases" at "https://repo.gradle.org/gradle/libs-releases/" +) + +Compile / doc / sources := Seq.empty +Compile / packageDoc / publishArtifact := false diff --git a/install-local-joern.sh b/install-local-joern.sh new file mode 100755 index 0000000..b2baf3c --- /dev/null +++ b/install-local-joern.sh @@ -0,0 +1,76 @@ +#!/usr/bin/env bash +set -o errexit +set -o pipefail +set -o nounset + +# extract joern_version from build.sbt - parsing just like in project/Versions.scala +readonly JOERN_VERSION=$(grep 'val joernVersion = "' build.sbt | sed 's/.*"\(.*\)"/\1/') + +# get script location, use as a root dir for this script +if [ "$(uname)" = 'Darwin' ]; then + # https://unix.stackexchange.com/a/96238 + if [ "${BASH_SOURCE:-x}" != 'x' ]; then + this_script=$BASH_SOURCE + elif [ "${ZSH_VERSION:-x}" != 'x' ]; then + setopt function_argzero + this_script=$0 + elif eval '[[ -n ${.sh.file} ]]' 2>/dev/null; then + eval 'this_script=${.sh.file}' + else + echo 1>&2 "Unsupported shell. Please use bash, ksh93 or zsh." + exit 2 + fi + relative_directory=$(dirname "$this_script") + SCRIPT_ABS_DIR=$(cd "$relative_directory" && pwd) +else + SCRIPT_ABS_PATH=$(readlink -f "$0") + SCRIPT_ABS_DIR=$(dirname "$SCRIPT_ABS_PATH") +fi + +# Check required tools are installed. +check_installed() { + if ! type "$1" > /dev/null; then + echo "Please ensure you have $1 installed." + exit 1 + fi +} + +readonly JOERN_INSTALL="$SCRIPT_ABS_DIR/joern-inst" + +if [ -d "${JOERN_INSTALL}" ]; then + echo "found existing local joern installation in $JOERN_INSTALL" + echo "should we wipe it and start fresh? [y/N]" + read ANSWER + if [ ! -z $ANSWER ]; then + if [ "y" == $ANSWER ] || [ "Y" == $ANSWER ]; then + rm -rf "$JOERN_INSTALL" + fi + fi +fi + +if [ ! -d "${JOERN_INSTALL}" ]; then + echo "downloading and installing joern $JOERN_VERSION..." 
+ check_installed "curl" + + # Fetch installer + echo "https://github.com/ShiftLeftSecurity/joern/releases/download/v$JOERN_VERSION/joern-install.sh" + curl -L "https://github.com/ShiftLeftSecurity/joern/releases/download/v$JOERN_VERSION/joern-install.sh" -o "$SCRIPT_ABS_DIR/joern-install.sh" + + # Install into `joern-inst` + chmod +x $SCRIPT_ABS_DIR/joern-install.sh + $SCRIPT_ABS_DIR/joern-install.sh --install-dir="$JOERN_INSTALL" --version=v$JOERN_VERSION --without-plugins + rm $SCRIPT_ABS_DIR/joern-install.sh + rm joern-cli.zip +fi + +readonly JAR_INSTALL_DIR=${JOERN_INSTALL}/joern-cli/lib/ + +echo "Building extension" +sbt clean stage + +echo "Installing jars into: ${JAR_INSTALL_DIR}" +rm ${JAR_INSTALL_DIR}/io.shiftleft.codepropertygraph-domain-classes* +cp target/universal/stage/lib/org.codeminers.standalone-* ${JAR_INSTALL_DIR} +cp target/universal/stage/lib/org.codeminers.*domain* ${JAR_INSTALL_DIR} + +echo "All done, you're ready to go in $JOERN_INSTALL" diff --git a/joern b/joern new file mode 120000 index 0000000..7df6070 --- /dev/null +++ b/joern @@ -0,0 +1 @@ +joern-inst/joern-cli/joern \ No newline at end of file diff --git a/joern-benchmarks-datasets b/joern-benchmarks-datasets new file mode 120000 index 0000000..5921710 --- /dev/null +++ b/joern-benchmarks-datasets @@ -0,0 +1 @@ +target/universal/stage/bin/main \ No newline at end of file diff --git a/log4j2.xml b/log4j2.xml new file mode 100644 index 0000000..4752de6 --- /dev/null +++ b/log4j2.xml @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/project/DownloadHelper.scala b/project/DownloadHelper.scala new file mode 100644 index 0000000..9dba958 --- /dev/null +++ b/project/DownloadHelper.scala @@ -0,0 +1,48 @@ +import java.io.File +import java.net.URI +import java.nio.file.{Files, Path, Paths} + +object DownloadHelper { + val LocalStorageDir = Paths.get(".local/source-urls") + + /** Downloads the remote file from the given url if either + * - the localFile is not available, + * - or the 
url is different from the previously downloaded file + * - or we don't have the original url from the previously downloaded file + * We store the information about the previously downloaded urls and the localFile in `.local` + */ + def ensureIsAvailable(url: String, localFile: File): Unit = { + if (!localFile.exists() || Option(url) != previousUrlForLocalFile(localFile)) { + val localPath = localFile.toPath + Files.deleteIfExists(localPath) + + println(s"[INFO] downloading $url to $localFile") + sbt.io.Using.urlInputStream(new URI(url).toURL) { inputStream => + sbt.IO.transfer(inputStream, localFile) + } + + // persist url in local storage + val storageFile = storageInfoFileFor(localFile) + Files.createDirectories(storageFile.getParent) + Files.writeString(storageFile, url) + } + } + + private def relativePathToProjectRoot(path: Path): String = + Paths + .get("") + .toAbsolutePath + .normalize() + .relativize(path.toAbsolutePath) + .toString + + private def previousUrlForLocalFile(localFile: File): Option[String] = { + Option(storageInfoFileFor(localFile)) + .filter(Files.exists(_)) + .map(Files.readString) + .filter(_.nonEmpty) + } + + private def storageInfoFileFor(localFile: File): Path = + LocalStorageDir.resolve(relativePathToProjectRoot(localFile.toPath)) +} diff --git a/project/Projects.scala b/project/Projects.scala new file mode 100644 index 0000000..ad9e91a --- /dev/null +++ b/project/Projects.scala @@ -0,0 +1,7 @@ +import sbt.* + +object Projects { + lazy val schema = project.in(file("schema")) + lazy val domainClasses = project.in(file("domain-classes")) + lazy val schemaExtender = project.in(file("schema-extender")) +} diff --git a/project/Versions.scala b/project/Versions.scala new file mode 100644 index 0000000..35ab31d --- /dev/null +++ b/project/Versions.scala @@ -0,0 +1,28 @@ +object Versions { + val cpg = parseVersion("cpgVersion") + val joern = parseVersion("joernVersion") + val overflowdb = parseVersion("overflowdbVersion") + + val betterFiles 
= "3.9.2" + val log4j = "2.20.0" + val requests = "0.8.0" + val scopt = "4.1.0" + val upickle = "3.3.0" + + val jsAstGen = "3.14.0" + + private def parseVersion(key: String): String = { + val versionRegexp = s""".*val $key[ ]+=[ ]?"(.*?)"""".r + val versions: List[String] = scala.io.Source + .fromFile("build.sbt") + .getLines + .filter(_.contains(s"val $key")) + .collect { case versionRegexp(version) => version } + .toList + assert( + versions.size == 1, + s"""unable to extract $key from build.sbt, expected exactly one line like `val $key= "0.0.0-SNAPSHOT"`.""" + ) + versions.head + } +} diff --git a/project/build.properties b/project/build.properties new file mode 100644 index 0000000..04267b1 --- /dev/null +++ b/project/build.properties @@ -0,0 +1 @@ +sbt.version=1.9.9 diff --git a/project/meta-build.sbt b/project/meta-build.sbt new file mode 100644 index 0000000..82d549c --- /dev/null +++ b/project/meta-build.sbt @@ -0,0 +1,4 @@ +libraryDependencies ++= Seq( + "com.github.pathikrit" %% "better-files" % "3.9.2", +) + diff --git a/project/plugins.sbt b/project/plugins.sbt new file mode 100644 index 0000000..769be48 --- /dev/null +++ b/project/plugins.sbt @@ -0,0 +1,6 @@ +addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.3") +addSbtPlugin("com.github.sbt" % "sbt-findbugs" % "2.0.0") +addSbtPlugin("com.dwijnand" % "sbt-dynver" % "4.1.1") +addSbtPlugin("com.github.sbt" % "sbt-native-packager" % "1.9.7") +addSbtPlugin("io.shiftleft" % "sbt-overflowdb" % "2.104") +addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.1.1") diff --git a/repl b/repl new file mode 120000 index 0000000..9289d91 --- /dev/null +++ b/repl @@ -0,0 +1 @@ +target/universal/stage/bin/repl-main \ No newline at end of file diff --git a/src/main/resources/log4j2.xml b/src/main/resources/log4j2.xml new file mode 100644 index 0000000..b51d9a4 --- /dev/null +++ b/src/main/resources/log4j2.xml @@ -0,0 +1,24 @@ + + + + %d{HH:mm:ss} [%level{WARN=*, DEBUG=#, ERROR=!, TRACE=%, INFO=+}] %msg%n + + + + + + + + 
+ + + + + + + + + + + + diff --git a/src/main/scala/io/joern/benchmarks/datasets/BenchmarkDataset.scala b/src/main/scala/io/joern/benchmarks/datasets/BenchmarkDataset.scala new file mode 100644 index 0000000..6684bfb --- /dev/null +++ b/src/main/scala/io/joern/benchmarks/datasets/BenchmarkDataset.scala @@ -0,0 +1,54 @@ +package io.joern.benchmarks.datasets + +import io.joern.benchmarks.datasets.BenchmarkDataset.benchmarkConstructors +import io.joern.benchmarks.datasets.AvailableBenchmarks +import io.joern.benchmarks.datasets.runner.{ + DatasetDownloader, + IchnaeaDownloader + +// TODO: Add when implementing + +// OWASPJavaDownloader, +// SecuribenchMicroDownloader +} +import org.slf4j.LoggerFactory +import upickle.default.* + +/** The main benchmarking process. + */ +class BenchmarkDataset(config: BenchmarkDatasetConfig) { + private val logger = LoggerFactory.getLogger(getClass) + + def evaluate(): Unit = { + logger.info("Beginning evaluation") + + def runBenchmark(benchmarkRunnerCreator: BenchmarkDatasetConfig => DatasetDownloader): Unit = { + val benchmarkRunner = benchmarkRunnerCreator(config) + val benchmarkName = benchmarkRunner.benchmarkName + logger.info(s"Running $benchmarkName") + benchmarkRunner.run() + } + + if (config.benchmark == AvailableBenchmarks.ALL) { + benchmarkConstructors.values.foreach(runBenchmark) + } else { + benchmarkConstructors.get(config.benchmark).foreach(runBenchmark) + } + } +} + +object BenchmarkDataset { + + val benchmarkConstructors: Map[AvailableBenchmarks.Value, BenchmarkDatasetConfig => DatasetDownloader] = Map( + // TODO: Add when implementing +// (AvailableBenchmarks.OWASP_JAVASRC, x => new OWASPJavaDownloader(x.datasetDir)), +// (AvailableBenchmarks.OWASP_JAVA, x => new OWASPJavaDownloader(x.datasetDir)), +// ( +// AvailableBenchmarks.SECURIBENCH_MICRO_JAVASRC, +// x => new SecuribenchMicroDownloader(x.datasetDir, JavaCpgTypes.JAVA_SRC) +// ), +// (AvailableBenchmarks.SECURIBENCH_MICRO_JAVA, x => new 
SecuribenchMicroDownloader(x.datasetDir, JavaCpgTypes.JAVA_BYTECODE)), + (AvailableBenchmarks.ICHNAEA_JSSRC, x => new IchnaeaDownloader(x.datasetDir)) + ) + +} diff --git a/src/main/scala/io/joern/benchmarks/datasets/BenchmarkDatasetConfig.scala b/src/main/scala/io/joern/benchmarks/datasets/BenchmarkDatasetConfig.scala new file mode 100644 index 0000000..ce247e4 --- /dev/null +++ b/src/main/scala/io/joern/benchmarks/datasets/BenchmarkDatasetConfig.scala @@ -0,0 +1,28 @@ +package io.joern.benchmarks.datasets + +import better.files.File + +case class BenchmarkDatasetConfig( + benchmark: AvailableBenchmarks.Value = AvailableBenchmarks.ALL, + datasetDir: File = File("workspace") +) + +object AvailableBenchmarks extends Enumeration { + val ALL = Value + val OWASP_JAVASRC = Value + val OWASP_JAVA = Value + val SECURIBENCH_MICRO_JAVASRC = Value + val SECURIBENCH_MICRO_JAVA = Value + val ICHNAEA_JSSRC = Value +} + +object JavaCpgTypes extends Enumeration { + val JAVA_SRC = Value + val JAVA_BYTECODE = Value +} + +object OutputFormat extends Enumeration { + val JSON = Value + val CSV = Value + val MD = Value +} diff --git a/src/main/scala/io/joern/benchmarks/datasets/Main.scala b/src/main/scala/io/joern/benchmarks/datasets/Main.scala new file mode 100644 index 0000000..a4c191b --- /dev/null +++ b/src/main/scala/io/joern/benchmarks/datasets/Main.scala @@ -0,0 +1,48 @@ +package io.joern.benchmarks.datasets + +import better.files.File +import org.slf4j.LoggerFactory +import scopt.OptionParser + +import scala.util.{Failure, Success} + +/** Example program that makes use of Joern as a library */ +object Main { + + private val logger = LoggerFactory.getLogger(getClass) + + def main(args: Array[String]): Unit = { + optionParser.parse(args, BenchmarkDatasetConfig()).map(BenchmarkDataset(_)).foreach(_.evaluate()) + } + + private val optionParser: OptionParser[BenchmarkDatasetConfig] = + new OptionParser[BenchmarkDatasetConfig]("joern-benchmark") { + + implicit val 
availableBenchmarksRead: scopt.Read[AvailableBenchmarks.Value] = + scopt.Read.reads(AvailableBenchmarks withName _) + + implicit val outputFormatRead: scopt.Read[OutputFormat.Value] = + scopt.Read.reads(OutputFormat withName _) + + implicit val betterFilesRead: scopt.Read[File] = + scopt.Read.reads(File.apply(_)) + + head("joern-benchmarks-datasets", ManifestVersionProvider().getVersion) + + note("A benchmark downloader tool for Joern benchmarks") + help('h', "help") + version("version").text("Prints the version") + + arg[AvailableBenchmarks.Value]("benchmark") + .text(s"The benchmark to download. Available [${AvailableBenchmarks.values.mkString(",")}]") + .required() + .action((x, c) => c.copy(benchmark = x)) + opt[File]('d', "dataset-dir") + .text("The dataset directory where benchmarks will be downloaded to. Default is `./workspace`.") + .action { (x, c) => + x.createDirectoryIfNotExists(createParents = true) + c.copy(datasetDir = x) + } + } + +} diff --git a/src/main/scala/io/joern/benchmarks/datasets/ManifestVersionProvider.scala b/src/main/scala/io/joern/benchmarks/datasets/ManifestVersionProvider.scala new file mode 100644 index 0000000..a0c5512 --- /dev/null +++ b/src/main/scala/io/joern/benchmarks/datasets/ManifestVersionProvider.scala @@ -0,0 +1,74 @@ +package io.joern.benchmarks.datasets + +import java.io.IOException +import java.util.jar.{Attributes, Manifest} +import scala.collection.mutable + +/** Returns version information from the CPGFL JAR file's /META-INF/MANIFEST.MF file. 
+ */ +class ManifestVersionProvider { + def getVersion: String = { + val resources = classOf[ManifestVersionProvider].getClassLoader.getResources("META-INF/MANIFEST.MF") + while ({ + resources.hasMoreElements + }) { + val url = resources.nextElement + try { + val manifest = new Manifest(url.openStream) + if (isApplicableManifest(manifest)) { + val attr = manifest.getMainAttributes + val versionInfo = new mutable.StringBuilder() + this.getVersion(attr) match { + case Some(version) => versionInfo.append("\"").append(version).append("\" ") + case None => versionInfo.append("0.0.0 ") + } + this.getBuildDate(attr) match { + case Some(date) => versionInfo.append(date) + case None => + } + this.getBuildNumber(attr) match { + case Some(buildNo) => versionInfo.append(" (build ").append(buildNo).append(")") + case None => + } + return versionInfo.toString() + } + } catch { + case _: IOException => return "" + } + } + "" + } + + private def isApplicableManifest(manifest: Manifest): Boolean = { + val attributes = manifest.getMainAttributes + "cpgfl" == this.getTitle(attributes).getOrElse("") + } + + private def get(attributes: Attributes, key: String): Option[String] = + Option(attributes.get(new Attributes.Name(key))).map(a => a.toString) + + private def getTitle(attributes: Attributes): Option[String] = get(attributes, "Implementation-Title") + + private def getVersion(attributes: Attributes): Option[String] = + get(attributes, "Implementation-Version").map(f => f.split("\\+")(0)) + + private def getBuildNumber(attributes: Attributes): Option[String] = + get(attributes, "Implementation-Version").flatMap { f => + val versionInfo = f.split("\\+") + if (versionInfo.length < 2) None + else Option(f.split("\\+")(1).replaceAll("\\d+-", "")) + } + + private def getBuildDate(attributes: Attributes): Option[String] = { + get(attributes, "Implementation-Version").flatMap { f => + val versionInfo = f.split("\\+") + if (versionInfo.length < 3) None + else 
Option(f.split("\\+")(2).replaceAll("-\\d+", "")) + } match { + case Some(rawDate) if rawDate.length >= 8 => + Option(Seq(rawDate.substring(0, 4), rawDate.substring(4, 6), rawDate.substring(6, 8)).mkString("-")) + case _ => None + } + } + +} diff --git a/src/main/scala/io/joern/benchmarks/datasets/ReplMain.scala b/src/main/scala/io/joern/benchmarks/datasets/ReplMain.scala new file mode 100644 index 0000000..7b8ea25 --- /dev/null +++ b/src/main/scala/io/joern/benchmarks/datasets/ReplMain.scala @@ -0,0 +1,21 @@ +package io.joern.benchmarks.datasets + +import io.joern.console.BridgeBase +import io.joern.joerncli.console.Predefined + +/** Extend/use joern as a REPL application */ +object ReplMain extends BridgeBase { + + def main(args: Array[String]): Unit = { + run(parseConfig(args)) + } + + override protected def predefLines = { + Predefined.forInteractiveShell ++ Seq(s"import _root_.${getClass.getPackageName}.*") + } + + override protected def promptStr = "benchmark-repl" + override protected def greeting = "Welcome to the benchmark REPL!" 
+ override protected def onExitCode = """println("goodbye!")""" + override def applicationName = "benchmarks-dataflowengineoss" +} diff --git a/src/main/scala/io/joern/benchmarks/datasets/package.scala b/src/main/scala/io/joern/benchmarks/datasets/package.scala new file mode 100644 index 0000000..5998d98 --- /dev/null +++ b/src/main/scala/io/joern/benchmarks/datasets/package.scala @@ -0,0 +1,3 @@ +package io.joern.benchmarks + +package object datasets {} diff --git a/src/main/scala/io/joern/benchmarks/datasets/runner/DatasetDownloader.scala b/src/main/scala/io/joern/benchmarks/datasets/runner/DatasetDownloader.scala new file mode 100644 index 0000000..d709bf6 --- /dev/null +++ b/src/main/scala/io/joern/benchmarks/datasets/runner/DatasetDownloader.scala @@ -0,0 +1,25 @@ +package io.joern.benchmarks.datasets.runner + +import better.files.File + +import org.slf4j.{Logger, LoggerFactory} +import scala.util.Try + +/** A process that downloads a benchmark. + */ +trait DatasetDownloader(protected val datasetDir: File) { + + protected val logger: Logger = LoggerFactory.getLogger(getClass) + + val benchmarkName: String + + /** Create and setup the benchmark if necessary. + * + * @return + * the directory where the benchmark is set up if successful. 
+ */ + protected def initialize(): Try[File] + + def run(): Unit +// +} diff --git a/src/main/scala/io/joern/benchmarks/datasets/runner/FileDownloader.scala b/src/main/scala/io/joern/benchmarks/datasets/runner/FileDownloader.scala new file mode 100644 index 0000000..fc720ec --- /dev/null +++ b/src/main/scala/io/joern/benchmarks/datasets/runner/FileDownloader.scala @@ -0,0 +1,199 @@ +package io.joern.benchmarks.datasets.runner + +import better.files.File +import org.apache.commons.compress.archivers.tar.TarArchiveInputStream + +import java.net.{HttpURLConnection, URL} +import scala.util.{Failure, Success, Try, Using} + +sealed trait FileDownloader { this: DatasetDownloader => + + import FileDownloader.* + + /** Downloads the archive and unpacks it to the `benchmarkBaseDir`. + * + * @return + * `benchmarkBaseDir` if the operation was successful. A failure if otherwise. + */ + protected def downloadBenchmarkAndUnarchive(compressionType: CompressionTypes.Value): Try[File] + + /** Downloads the archive. + * @param ext + * an optional extension for the downloaded file. Must include the dot. + * @return + * The downloaded file if the operation was successful. A failure if otherwise. + */ + protected def downloadBenchmark(ext: Option[String] = None): Try[File] + + protected def downloadFile(url: URL, destFile: File): Try[File] = Try { + if (destFile.notExists) { + destFile.parent.createDirectoryIfNotExists(true) + var connection: Option[HttpURLConnection] = None + try { + connection = Option(url.openConnection().asInstanceOf[HttpURLConnection]) + connection.foreach { + case conn if conn.getResponseCode == HttpURLConnection.HTTP_OK => + Using.resources(conn.getInputStream, destFile.newFileOutputStream()) { (is, fos) => + val buffer = new Array[Byte](4096) + Iterator + .continually(is.read(buffer)) + .takeWhile(_ != -1) + .foreach(bytesRead => fos.write(buffer, 0, bytesRead)) + } + case conn => + throw new RuntimeException( + s"Unable to download $benchmarkName from $url. 
Status code ${conn.getResponseCode}" + ) + } + } finally { + connection.foreach(_.disconnect()) + } + } + destFile + } + + protected def downloadFileAndUnarchive(url: URL, destFile: File, compressionType: CompressionTypes.Value): Unit = { + compressionType match { + case CompressionTypes.ZIP => + downloadFile(url, File(s"${destFile.pathAsString}.zip")) match { + case Success(f) => + f.unzipTo(destFile) + f.delete(swallowIOExceptions = true) + case Failure(e) => throw e + } + case CompressionTypes.TGZ => + downloadFile(url, File(s"${destFile.pathAsString}.tgz")) match { + case Success(f) => + val tarball = f.unGzipTo(File(s"${destFile.pathAsString}.tar")) + tarball.unTarTo(destFile) + f.delete(swallowIOExceptions = true) + tarball.delete(swallowIOExceptions = true) + case Failure(e) => throw e + } + } + } +} + +trait SingleFileDownloader extends FileDownloader { this: DatasetDownloader => + + /** The URL to the archive. + */ + protected val benchmarkUrl: URL + + /** The name of the benchmark archive file name without extension. + */ + protected val benchmarkFileName: String + + /** The name of the benchmark directory. + */ + protected val benchmarkBaseDir: File + + override def downloadBenchmarkAndUnarchive(compressionType: CompressionTypes.Value): Try[File] = Try { + downloadFileAndUnarchive(benchmarkUrl, datasetDir, compressionType) + benchmarkBaseDir + } + + override def downloadBenchmark(ext: Option[String] = None): Try[File] = Try { + val targetFile = datasetDir / s"$benchmarkFileName${ext.getOrElse("")}" + if (!benchmarkBaseDir.exists || benchmarkBaseDir.list.forall(_.isDirectory)) { + benchmarkBaseDir.createDirectoryIfNotExists(createParents = true) + downloadFile(benchmarkUrl, targetFile) match { + case Failure(exception) => throw exception + case _ => + } + } + targetFile + } + +} + +trait MultiFileDownloader extends FileDownloader { this: DatasetDownloader => + + /** The URL to the archive. 
+ */ + protected val benchmarkUrls: Map[String, URL] + + /** The name of the benchmark directory to download all benchmark components to. + */ + protected val benchmarkDirName: String + + /** The base directory of the benchmark. + */ + protected val benchmarkBaseDir: File + + override def downloadBenchmarkAndUnarchive(compressionType: CompressionTypes.Value): Try[File] = Try { + benchmarkUrls.foreach { case (fileName, url) => + val targetDir = benchmarkBaseDir / fileName + // TODO: Make sure dir goes to `benchmarkBaseDir / benchmarkDirName / fileName` + if (!targetDir.isDirectory) { + downloadFileAndUnarchive(url, targetDir, compressionType) + } + } + + benchmarkBaseDir + } + + /** Downloads the benchmark archives without unarchiving them. + * + * @param ext + * an optional extension for the downloaded file. Must include the dot. + * @return + * The download directory if the operation was successful. A failure if otherwise. + */ + override def downloadBenchmark(ext: Option[String] = None): Try[File] = Try { + val targetDir = datasetDir / benchmarkDirName + benchmarkUrls.foreach { case (fileName, url) => + val targetFile = targetDir / s"$fileName${ext.getOrElse("")}" + if (!benchmarkBaseDir.exists || benchmarkBaseDir.list.forall(_.isDirectory)) { + benchmarkBaseDir.createDirectoryIfNotExists(createParents = true) + downloadFile(url, targetFile) + } + } + targetDir + } + + protected def zipBenchmarkDirectory(benchmarkDir: File = benchmarkBaseDir): Try[File] = Try { + val zippedDestDir = File(s"${benchmarkDir.pathAsString}.zip") + benchmarkDir.zipTo(zippedDestDir) + zippedDestDir + } +} + +/** The supported compression types. + */ +object CompressionTypes extends Enumeration { + val ZIP, TGZ = Value +} + +object FileDownloader { + + implicit class FileExt(targetFile: File) { + + /** Ungzips and untars the file to the given target directory. + * + * @param destination + * the directory to unpack to. 
+ */ + def unTarTo(destination: File): destination.type = { + Using.resource(targetFile.newInputStream) { archiveFis => + val tarIs = new TarArchiveInputStream(archiveFis) + Iterator + .continually(tarIs.getNextEntry) + .takeWhile(_ != null) + .filter(sourceEntry => !sourceEntry.isDirectory && !sourceEntry.getName.contains("..") // naive zip slip check + ) + .foreach { sourceEntry => + val destFile = destination / sourceEntry.getName createIfNotExists (createParents = true) + Using.resource(destFile.newFileOutputStream()) { fos => + val buffer = new Array[Byte](4096) + Iterator + .continually(tarIs.read(buffer)) + .takeWhile(_ != -1) + .foreach(bytesRead => fos.write(buffer, 0, bytesRead)) + } + } + } + destination + } + } +} diff --git a/src/main/scala/io/joern/benchmarks/datasets/runner/IchnaeaDownloader.scala b/src/main/scala/io/joern/benchmarks/datasets/runner/IchnaeaDownloader.scala new file mode 100644 index 0000000..b3380e8 --- /dev/null +++ b/src/main/scala/io/joern/benchmarks/datasets/runner/IchnaeaDownloader.scala @@ -0,0 +1,96 @@ +package io.joern.benchmarks.datasets.runner + +import better.files.File +import com.github.sh4869.semver_parser.{Range, SemVer} +import io.joern.benchmarks.* +import org.slf4j.LoggerFactory +import upickle.default.* + +import java.net.{URI, URL} +import scala.util.{Failure, Success, Try, Using} + +class IchnaeaDownloader(datasetDir: File) extends DatasetDownloader(datasetDir) with MultiFileDownloader { + + private val logger = LoggerFactory.getLogger(getClass) + + override val benchmarkName = s"Ichnaea" + + private val packageNameAndVersion: Map[String, String] = Map( + "chook-growl-reporter" -> "0.0.1", + "cocos-utils" -> "1.0.0", + "gm" -> "1.20.0", + "fish" -> "0.0.0", + "git2json" -> "0.0.1", + "growl" -> "1.9.2", + "libnotify" -> "1.0.3", + "m-log" -> "0.0.1", + "mixin-pro" -> "0.6.6", + "modulify" -> "0.1.0-1", + "mongo-parse" -> "1.0.5", + "mongoosemask" -> "0.0.6", + "mongoosify" -> "0.0.3", + "node-os-utils" -> 
"1.0.7", + "node-wos" -> "0.2.3", + "office-converter" -> "1.0.2", + "os-uptime" -> "2.0.1", + "osenv" -> "0.1.5", + "pidusage" -> "1.1.4", + "pomelo-monitor" -> "0.3.7", + "system-locale" -> "0.1.0", + "systeminformation" -> "3.42.2" + ) + + override protected val benchmarkUrls: Map[String, URL] = packageNameAndVersion.flatMap { + case (packageName, version) => + parsePackageArtifactUrl(createNpmJsLookup(packageName, version)) match { + case Success(distUrl) => Option(packageName -> distUrl) + case Failure(exception) => + logger.error(s"Unable to determine module artifact for $packageName@$version", exception) + None + } + } + override protected val benchmarkDirName: String = "ichnaea" + override protected val benchmarkBaseDir: File = datasetDir / benchmarkDirName + + private def createNpmJsLookup(packageName: String, version: String): URL = URI( + s"https://registry.npmjs.com/$packageName/$version" + ).toURL + + private def parsePackageArtifactUrl(registryUrl: URL): Try[URL] = Try { + Using.resource(registryUrl.openStream()) { is => + read[NPMRegistryResponse](ujson.Readable.fromByteArray(is.readAllBytes())).dist.tarball + } + } + + override def initialize(): Try[File] = { + val downloadedDir = downloadBenchmarkAndUnarchive(CompressionTypes.TGZ) match { + case Success(dir) => + dir + case Failure(e) => throw e + } + + zipBenchmarkDirectory(downloadedDir) + } + + override def run(): Unit = { + initialize() match { + case Failure(exception) => + logger.error(s"Unable to initialize benchmark '$getClass'", exception) + case Success(benchmarkDir) => + logger.info(s"Finished downloading benchmark `$getClass``") + } + } +} + +implicit val urlRw: ReadWriter[URL] = readwriter[ujson.Value] + .bimap[URL]( + x => ujson.Str(x.toString), + { + case json @ (j: ujson.Str) => URI(json.str).toURL + case x => throw RuntimeException(s"Unexpected value type for URL strings: ${x.getClass}") + } + ) + +case class NPMRegistryResponse(dist: NPMDistBody) derives ReadWriter + +case class 
NPMDistBody(tarball: URL) derives ReadWriter diff --git a/updateDependencies.sh b/updateDependencies.sh new file mode 100755 index 0000000..03651a1 --- /dev/null +++ b/updateDependencies.sh @@ -0,0 +1,105 @@ +#!/usr/bin/env bash +NON_INTERACTIVE_OPTION=$1 +DEPENDENCY=$2 + +check_installed() { + if ! type "$1" > /dev/null; then + echo "Please ensure you have $1 installed." + exit 1 + fi +} + +check_installed curl + +# macOS is known to ship 'bash' version < 4 +if [[ "$OSTYPE" == "darwin"* ]]; then + BASH_VERSION=`bash --version | grep "GNU bash, version " | awk '{print $4}' | cut -d. -f1` + if [[ $BASH_VERSION -lt 4 ]]; then + echo "error: 'bash' version detected is less than 4" + if [ "$NON_INTERACTIVE_OPTION" == "--non-interactive" ] + then + echo "update 'bash' using 'brew install bash'? [Y/n]" + read ANSWER + if [ -z $ANSWER ] || [ "y" == $ANSWER ] || [ "Y" == $ANSWER ] + then + brew install bash + else + exit 1 + fi + else + echo "error: Please upgrade bash version and re-run" + exit 1 + fi + fi +fi + +# check if xmllint is installed +if type xmllint > /dev/null; then + USE_XMLLINT=1 #true +else + echo "warning: xmllint is not installed - will try with 'grep' as a fallback..." 
+ USE_XMLLINT=0 #false +fi + +declare -A repos=( + [cpg]=https://repo1.maven.org/maven2/io/shiftleft/codepropertygraph-schema_3 + [joern]=https://repo1.maven.org/maven2/io/joern/console_3 + [overflowdb]=https://repo1.maven.org/maven2/io/shiftleft/overflowdb-core_3 +) + +function latest_version { + local NAME=$1 + local REPO_URL=${repos[$NAME]} + local MVN_META_URL=$REPO_URL/maven-metadata.xml + local CURL_PARAMS="--silent --show-error $MVN_META_URL" + + if (( $USE_XMLLINT )) + then + curl $CURL_PARAMS | xmllint --xpath "/metadata/versioning/latest/text()" - + else + curl $CURL_PARAMS | grep '' | sed 's/[ ]*\([0-9.]*\)<\/latest>/\1/' + fi +} + +function update { + local NAME=$1 + if [[ -z "${repos[$NAME]}" ]]; then + echo "error: no repo url defined for $NAME" + exit 1; + fi + + local VERSION=$(latest_version $NAME) + local SEARCH="val ${NAME}Version\([ ]*\)= .*" + local OLD_VERSION=$(grep "$SEARCH" build.sbt | sed 's/.*"\(.*\)"/\1/') + + if [ "$VERSION" == "$OLD_VERSION" ] + then + echo "$NAME: unchanged ($VERSION)" + else + local REPLACE="val ${NAME}Version\1= \"$VERSION\"" + + if [ "$NON_INTERACTIVE_OPTION" == "--non-interactive" ] + then + echo "non-interactive mode, auto-updating $NAME: $OLD_VERSION -> $VERSION" + sed -i'.bak' "s/$SEARCH/$REPLACE/" build.sbt + else + echo "update $NAME: $OLD_VERSION -> $VERSION? [Y/n]" + read ANSWER + if [ -z $ANSWER ] || [ "y" == $ANSWER ] || [ "Y" == $ANSWER ] + then + sed -i'.bak' "s/$SEARCH/$REPLACE/" build.sbt + fi + fi + fi +} + +if [ "$DEPENDENCY" == "" ]; then + update cpg + update joern + update overflowdb +else + DEPENDENCY="${DEPENDENCY#--only=}" + update $DEPENDENCY +fi + +rm -f build.sbt.bak # Remove back-up files generated by 'sed'