diff --git a/docs/src/main/scala/entry.scala.md b/docs/src/main/scala/entry.scala.md
new file mode 100644
index 0000000..6c44986
--- /dev/null
+++ b/docs/src/main/scala/entry.scala.md
@@ -0,0 +1,37 @@
+
+```scala
+package com.bio4j.data.enzyme
+
+/** The data available for an ENZYME entry; `flat.Entry` implements it on top of raw lines. */
+trait AnyEntry extends Any {
+
+  def ID: String
+  def subSubClassID: String
+
+  def description: String
+  def alternativeNames: Seq[String]
+  def cofactors: Seq[String]
+  def catalyticActivity: String
+  def comments: Seq[String]
+}
+
+/** An enzyme class, subclass or sub-subclass: an `ID` together with its `description`. */
+sealed trait EnzymeClasses extends Any {
+
+  def ID: String
+  def description: String
+}
+  case class EnzymeClass(val ID: String, val description: String) extends EnzymeClasses
+  case class EnzymeSubClass(val ID: String, val description: String) extends EnzymeClasses
+  case class EnzymeSubSubClass(val ID: String, val description: String) extends EnzymeClasses
+
+```
+
+
+
+
+[test/scala/EnzymeEntries.scala]: ../../test/scala/EnzymeEntries.scala.md
+[test/scala/EnzymeClasses.scala]: ../../test/scala/EnzymeClasses.scala.md
+[main/scala/entry.scala]: entry.scala.md
+[main/scala/flat/entry.scala]: flat/entry.scala.md
+[main/scala/flat/classes.scala]: flat/classes.scala.md
\ No newline at end of file
diff --git a/docs/src/main/scala/flat/classes.scala.md b/docs/src/main/scala/flat/classes.scala.md
new file mode 100644
index 0000000..d236e7e
--- /dev/null
+++ b/docs/src/main/scala/flat/classes.scala.md
@@ -0,0 +1,120 @@
+
+```scala
+package com.bio4j.data.enzyme.flat
+
+import com.bio4j.data.enzyme._
+
+/** One line of `enzclass.txt`: a 9-character ID column followed by the description. */
+case class ClassLine(val line: String) {
+
+  /** Builds the class, subclass or sub-subclass this line describes.
+    *
+    * Throws a `MatchError` for a fully specified ID such as `1.1.1.1`, and an
+    * `ArrayIndexOutOfBoundsException` for an ID with fewer than four dot-separated fragments.
+    */
+  final def asEnzymeClass: EnzymeClasses = {
+
+    ID match {
+      // the order is important here
+      case id if (id == classID) => EnzymeClass(id, description)
+      case id if (id == subClassID) => EnzymeSubClass(id, description)
+      case id if (id == subSubClassID) => EnzymeSubSubClass(id, description)
+    }
+  }
+```
+
+
+In the `enzclass.txt` source file the ID always takes 9 characters, but it contains padding spaces that need to be removed.
+
+
+```scala
+  private lazy val ID: String =
+    line
+      .take(9)
+      .filter(_ != ' ')
+
+  private lazy val IDFragments: (String,String,String,String) = {
+
+    val fragments = ID.split('.').take(4)
+
+    (fragments(0), fragments(1), fragments(2), fragments(3))
+  }
+
+  private def classID: String =
+    s"${IDFragments._1}.-.-.-"
+
+  private def subClassID: String =
+    s"${IDFragments._1}.${IDFragments._2}.-.-"
+
+  private def subSubClassID: String =
+    s"${IDFragments._1}.${IDFragments._2}.${IDFragments._3}.-"
+```
+
+
+We don't want to store the description with a dot at the end!
+
+
+```scala
+  private lazy val description: String =
+    line
+      .drop(9)
+      .trim
+      .stripSuffix(".")
+}
+
+case object enzymeClasses {
+```
+
+
+The Enzyme source file `enzclass.txt` starts with:
+
+```
+---------------------------------------------------------------------------
+ ENZYME nomenclature database
+ SIB Swiss Institute of Bioinformatics; Geneva, Switzerland
+----------------------------------------------------------------------------
+
+Description: Definition of enzyme classes, subclasses and sub-subclasses
+Name: enzclass.txt
+Release: 07-Sep-2016
+
+----------------------------------------------------------------------------
+
+1. -. -.- Oxidoreductases.
+1. 1. -.- Acting on the CH-OH group of donors.
+```
+
+it also ends with:
+
+```
+----------------------------------------------------------------------------
+Copyrighted by the SIB Swiss Institute of Bioinformatics.
+There are no restrictions on its use by any institutions as long as
+its content is in no way modified.
+----------------------------------------------------------------------------
+```
+
+so we are only picking lines with a dot in the second char.
+
+Note that there are empty lines now and then, which need to be filtered out too.
+
+
+```scala
+  /** Parses the lines having a dot as second character; every other line is skipped. */
+  def fromLines(lines: Iterator[String]): Iterator[EnzymeClasses] =
+    lines
+      .filter(_.nonEmpty)
+      // the length check keeps one-character lines from failing on line(1)
+      .collect { case line if(line.length > 1 && line(1) == '.') => ClassLine(line).asEnzymeClass }
+}
+
+```
+
+
+
+
+[test/scala/EnzymeEntries.scala]: ../../../test/scala/EnzymeEntries.scala.md
+[test/scala/EnzymeClasses.scala]: ../../../test/scala/EnzymeClasses.scala.md
+[main/scala/entry.scala]: ../entry.scala.md
+[main/scala/flat/entry.scala]: entry.scala.md
+[main/scala/flat/classes.scala]: classes.scala.md
\ No newline at end of file
diff --git a/docs/src/main/scala/flat/entry.scala.md b/docs/src/main/scala/flat/entry.scala.md
new file mode 100644
index 0000000..8fd32e9
--- /dev/null
+++ b/docs/src/main/scala/flat/entry.scala.md
@@ -0,0 +1,181 @@
+
+```scala
+package com.bio4j.data.enzyme.flat
+
+import com.bio4j.data.enzyme._
+
+/** An ENZYME entry backed by its raw lines; each field is read from the lines with its two-letter prefix. */
+case class Entry(val lines: Seq[String]) extends AnyEntry {
+
+  def ID: String =
+    id.value
+
+  def subSubClassID: String =
+    id.subSubClassID
+
+  def description: String =
+    de.description
+
+  def alternativeNames: Seq[String] =
+    an.alternativeNames
+
+  def cofactors: Seq[String] =
+    cf.cofactors
+
+  def catalyticActivity: String =
+    ca.catalyticActivity
+
+  def comments: Seq[String] =
+    cc.comments
+
+  private lazy val id: ID =
+    new ID(linesWith(prefix = "ID").head)
+
+  private lazy val de: DE =
+    DE(linesWith(prefix = "DE"))
+
+  private lazy val an: AN =
+    AN(linesWith(prefix = "AN"))
+
+  private lazy val cf: CF =
+    CF(linesWith(prefix = "CF"))
+
+  private lazy val ca: CA =
+    CA(linesWith(prefix = "CA"))
+
+  private lazy val cc: CC =
+    CC(linesWith(prefix = "CC"))
+
+  private def linesWith(prefix: String): Seq[String] =
+    lines collect { case line if(line startsWith prefix) => line.stripPrefix(prefix).trim }
+}
+
+private case class ID(val value: String) extends AnyVal {
+
+  def subSubClassID: String =
+    s"${value.take(value.lastIndexOf('.') + 1)}-"
+}
+
+private case class DE(val lines: Seq[String]) extends AnyVal {
+
+  def description: String =
+    lines
+      .map(_.trim.stripSuffix("."))
+      .mkString(" ")
+}
+
+private case class AN(val lines: Seq[String]) extends AnyVal {
+
+  def alternativeNames: Seq[String] =
+    lines
+      .mkString(" ")
+      .split('.')
+      // every name but the first keeps the space that followed the previous dot
+      .map(_.trim)
+      // without any AN line the split would otherwise yield a single empty name
+      .filter(_.nonEmpty)
+}
+
+private case class CF(val lines: Seq[String]) extends AnyVal {
+
+  def cofactors: Seq[String] =
+    lines
+      // joined with a space, like the other fields, so wrapped lines do not glue words together
+      .mkString(" ")
+      .split(';')
+      .map(_.trim.stripSuffix("."))
+      .filter(_.nonEmpty)
+}
+
+private case class CA(val lines: Seq[String]) extends AnyVal {
+
+  def catalyticActivity: String =
+    lines.mkString(" ")
+}
+
+private case class CC(val lines: Seq[String]) extends AnyVal {
+
+  def comments: Seq[String] =
+    lines.mkString(" ")
+      .split("-!-")
+      .collect { case txt if(txt.nonEmpty) => txt.trim.stripSuffix(".") }
+}
+
+case object entries {
+```
+
+
+The ENZYME entries file has a "header" consisting of CC lines and an end-of-entry `//` line.
+
+
+```scala
+  /** Splits `lines` into entries after skipping the leading CC and `//` header lines. */
+  def fromLines(lines: Seq[String]): Seq[Entry] =
+    entryLines(lines.dropWhile( l => l.startsWith("CC") || l.startsWith("//") )).map { Entry(_) }
+
+  /** Like `fromLines`, but without deleted or transferred entries. */
+  def validFromLines(lines: Seq[String]): Seq[Entry] =
+    fromLines(lines) filter isValid
+```
+
+
+See ftp://ftp.expasy.org/databases/enzyme/enzuser.txt
+
+
+```scala
+  private def isValid(entry: Entry): Boolean =
+    !( entry.description.startsWith("Deleted entry") || entry.description.startsWith("Transferred entry") )
+
+  // Groups lines into entries, closing one at each `//` line; lines after the last `//` are dropped.
+  @annotation.tailrec
+  private def entryLinesRec(
+    currentLine: Option[String],
+    linesLeft: Seq[String],
+    entryAcc: Seq[String],
+    acc: Seq[Seq[String]]
+  )
+  : Seq[Seq[String]] =
+    currentLine match {
+      case None => acc
+      case Some(line) => {
+
+        if(isEndLine(line))
+          entryLinesRec(
+            currentLine = linesLeft.headOption,
+            linesLeft = linesLeft.drop(1),
+            entryAcc = Vector(),
+            acc = acc :+ entryAcc
+          )
+        else
+          entryLinesRec(
+            currentLine = linesLeft.headOption,
+            linesLeft = linesLeft.drop(1),
+            entryAcc = entryAcc :+ line,
+            acc = acc
+          )
+      }
+    }
+
+  private def entryLines(lines: Seq[String]): Seq[Seq[String]] =
+    entryLinesRec(
+      currentLine = lines.headOption,
+      // drop(1) rather than tail: tail throws on an empty Seq
+      linesLeft = lines.drop(1),
+      entryAcc = Vector(),
+      acc = Vector()
+    )
+
+  private def isEndLine(line: String): Boolean =
+    line.startsWith("//")
+}
+
+```
+
+
+
+
+[test/scala/EnzymeEntries.scala]: ../../../test/scala/EnzymeEntries.scala.md
+[test/scala/EnzymeClasses.scala]: ../../../test/scala/EnzymeClasses.scala.md
+[main/scala/entry.scala]: ../entry.scala.md
+[main/scala/flat/entry.scala]: entry.scala.md
+[main/scala/flat/classes.scala]: classes.scala.md
\ No newline at end of file
diff --git a/docs/src/test/scala/EnzymeClasses.scala.md b/docs/src/test/scala/EnzymeClasses.scala.md
new file mode 100644
index 0000000..92ecb56
--- /dev/null
+++ b/docs/src/test/scala/EnzymeClasses.scala.md
@@ -0,0 +1,59 @@
+
+```scala
+package com.bio4j.data.enzyme.test
+
+import org.scalatest.FunSuite
+
+import com.bio4j.data.enzyme._
+
+class ParseEnzymeClasses extends FunSuite {
+
+  // Read eagerly so the file handle can be closed before the lines are consumed.
+  def lines: Iterator[String] = {
+    val source = io.Source.fromFile("enzclass.txt")
+    try source.getLines.toList.iterator finally source.close()
+  }
+
+  def allEnzymeClasses = flat.enzymeClasses.fromLines(lines)
+
+  test("parse all enzyme classes") {
+
+    allEnzymeClasses.foreach { e =>
+
+      val clazz = e
+    }
+  }
+```
+
+
+This is unlikely to change
+
+
+```scala
+  test("check first classes") {
+
+    val firstFive = (allEnzymeClasses take 5).toList
+
+    assert {
+
+      firstFive === List[EnzymeClasses](
+        EnzymeClass("1.-.-.-", "Oxidoreductases"),
+        EnzymeSubClass("1.1.-.-", "Acting on the CH-OH group of donors"),
+        EnzymeSubSubClass("1.1.1.-", "With NAD(+) or NADP(+) as acceptor"),
+        EnzymeSubSubClass("1.1.2.-", "With a cytochrome as acceptor"),
+        EnzymeSubSubClass("1.1.3.-", "With oxygen as acceptor")
+      )
+    }
+  }
+}
+
+```
+
+
+
+
+[test/scala/EnzymeEntries.scala]: EnzymeEntries.scala.md
+[test/scala/EnzymeClasses.scala]: EnzymeClasses.scala.md
+[main/scala/entry.scala]: ../../main/scala/entry.scala.md
+[main/scala/flat/entry.scala]: ../../main/scala/flat/entry.scala.md
+[main/scala/flat/classes.scala]: ../../main/scala/flat/classes.scala.md
\ No newline at end of file
diff --git a/docs/src/test/scala/EnzymeEntries.scala.md b/docs/src/test/scala/EnzymeEntries.scala.md
new file mode 100644
index 0000000..5137072
--- /dev/null
+++ b/docs/src/test/scala/EnzymeEntries.scala.md
@@ -0,0 +1,65 @@
+
+```scala
+package com.bio4j.data.enzyme.test
+
+import org.scalatest.FunSuite
+
+import com.bio4j.data.enzyme._
+
+class ParseEnzymeEntries extends FunSuite {
+
+  // Read eagerly so the file handle can be closed; `getLines.toSeq` alone never closes it.
+  lazy val lines: Seq[String] = {
+    val source = io.Source.fromFile("enzyme.dat")
+    try source.getLines.toVector finally source.close()
+  }
+
+  lazy val allEntries = flat.entries.fromLines(lines)
+
+
+  test("parse all entries and access all data") {
+
+    allEntries foreach { e =>
+
+      val id = e.ID
+      val subSubClassID = e.subSubClassID
+      val description = e.description
+      val alternativeNames = e.alternativeNames
+      val cofactors = e.cofactors
+      val catalyticActivity = e.catalyticActivity
+      val comments = e.comments
+    }
+  }
+
+  test("check first entry") {
+
+    val firstEntry = allEntries.head
+
+    assert {
+
+      ( firstEntry.ID === "1.1.1.1" ) &&
+      ( firstEntry.subSubClassID === "1.1.1.-" ) &&
+      ( firstEntry.description === "Alcohol dehydrogenase" ) &&
+      ( firstEntry.alternativeNames === Seq("Aldehyde reductase") ) &&
+      (
+        firstEntry.catalyticActivity === "(1) An alcohol + NAD(+) = an aldehyde or ketone + NADH. (2) A secondary alcohol + NAD(+) = a ketone + NADH."
+      ) &&
+      ( firstEntry.cofactors === Seq("Zn(2+) or Fe cation") ) &&
+      firstEntry.comments === Seq(
+        "Acts on primary or secondary alcohols or hemi-acetals with very broad specificity; however the enzyme oxidizes methanol much more poorly than ethanol",
+        "The animal, but not the yeast, enzyme acts also on cyclic secondary alcohols"
+      )
+    }
+  }
+}
+
+```
+
+
+
+
+[test/scala/EnzymeEntries.scala]: EnzymeEntries.scala.md
+[test/scala/EnzymeClasses.scala]: EnzymeClasses.scala.md
+[main/scala/entry.scala]: ../../main/scala/entry.scala.md
+[main/scala/flat/entry.scala]: ../../main/scala/flat/entry.scala.md
+[main/scala/flat/classes.scala]: ../../main/scala/flat/classes.scala.md
\ No newline at end of file