From 3f7a6f011823c78dca6da47f81e4a6e7a14d80cb Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Wed, 27 May 2020 01:11:52 +0000 Subject: [PATCH 01/30] Add documentation --- Support/Text/WordSeg/Alphabet.swift | 19 ++++++-- Support/Text/WordSeg/CharacterSequence.swift | 49 +++++++++++++++++++- Support/Text/WordSeg/Lexicon.swift | 40 ++++++++-------- 3 files changed, 84 insertions(+), 24 deletions(-) diff --git a/Support/Text/WordSeg/Alphabet.swift b/Support/Text/WordSeg/Alphabet.swift index 52d6672ef7e..273513436df 100644 --- a/Support/Text/WordSeg/Alphabet.swift +++ b/Support/Text/WordSeg/Alphabet.swift @@ -14,20 +14,29 @@ import TensorFlow -/// Alphabet maps from characters in a string to Int32 representations. -/// -/// Note: we map from String in order to support multi-character metadata sequences such as . +/// A collection that maps individual characters to an integer representation. /// /// In Python implementations, this is sometimes called the character vocabulary. +/// +/// - Note: We map from String in order to support multi-character metadata sequences such as ``. public struct Alphabet { + /// A type whose instances represent a character. public typealias Element = String + /// A one-to-one mapping between a set of characters and a unique integer. public var dictionary: BijectiveDictionary + /// A marker denoting the end of a sequence. public let eos: Int32 + + /// A marker denoting the end of a word. public let eow: Int32 + + /// A marker used for padding inside a sequence. public let pad: Int32 + /// Creates an instance containing a mapping from `letters` to unique + /// integers, including markers. public init(_ letters: C, eos: String, eow: String, pad: String) where C.Element == Character { self.dictionary = .init(zip(letters.lazy.map { String($0) }, 0...)) @@ -42,6 +51,8 @@ public struct Alphabet { self.dictionary[pad] = self.pad } + /// Creates an instance containing a mapping from `letters` to unique + /// integers, including markers. 
public init(_ letters: C, eos: String, eow: String, pad: String) where C.Element == Element { self.dictionary = .init(zip(letters.lazy.map { String($0) }, 0...)) @@ -56,8 +67,10 @@ public struct Alphabet { self.dictionary[pad] = self.pad } + /// A count of the characters in the alphabet, including markers. public var count: Int { return dictionary.count } + /// Accesses the `key`th element. public subscript(key: String) -> Int32? { return dictionary[key] } diff --git a/Support/Text/WordSeg/CharacterSequence.swift b/Support/Text/WordSeg/CharacterSequence.swift index 0a39c0b31c5..d5190d28c71 100644 --- a/Support/Text/WordSeg/CharacterSequence.swift +++ b/Support/Text/WordSeg/CharacterSequence.swift @@ -14,16 +14,25 @@ import TensorFlow -/// An Int32-based representation of a string to be used with the WordSeg model. +/// A sequence of characters represented by integers. public struct CharacterSequence: Hashable { + /// A collection of integers representing a sequence of characters. public let characters: [Int32] + /// A marker denoting the end of the sequence. private let eos: Int32 + /// Creates an instance without meaningful contents. public init(_debug: Int) { self.characters = [] self.eos = -1 } + /// Creates a sequence from `string`, using the integers from `alphabet`, + /// appended with the end of sequence marker. + /// + /// - Parameter alphabet: character to integer mapping. + /// - Parameter appendingEoSTo: string to be converted to a sequence of + /// integers. public init(alphabet: Alphabet, appendingEoSTo string: String) throws { var characters = [Int32]() characters.reserveCapacity(string.count + 1) @@ -37,33 +46,71 @@ public struct CharacterSequence: Hashable { self.init(alphabet: alphabet, characters: characters) } + /// Creates a sequence from `characters` and sets the end of sequence marker + /// from `alphabet`. + /// + /// - Parameter alphabet: character to integer mapping. 
+ /// - Parameter characters: sequence of integers with a terminal end of + /// sequence marker. private init(alphabet: Alphabet, characters: [Int32]) { self.characters = characters self.eos = alphabet.eos } + /// Creates a sequence from `characters` and sets the end of sequence marker + /// from `alphabet`. + /// + /// - Parameter alphabet: character to integer mapping. + /// - Parameter characters: sequence of integers with a terminal end of + /// sequence marker. public init(alphabet: Alphabet, characters: ArraySlice) { self.characters = [Int32](characters) self.eos = alphabet.eos } + /// Accesses the `index`th character. public subscript(index: Int32) -> Int32 { return characters[Int(index)] } + /// Accesses characters within `range`. public subscript(range: Range) -> ArraySlice { return characters[range] } + /// Count of characters in the sequence, including the end marker. public var count: Int { return characters.count } + /// The last character in the sequence, i.e. the end marker. public var last: Int32? { return characters.last } + /// TODO: what's happening here? public var tensor: Tensor { Tensor([self.eos] + characters[0.. + /// A count of unique logical words in the lexicon. public var count: Int { return dictionary.count } + /// Creates an instance containing a mapping from `sequences` to unique + /// integers. + /// + /// - Parameter sequences: character sequences to compose the lexicon. public init(_ sequences: C) where C.Element == Element { self.dictionary = .init(zip(sequences, 0...)) } + /// Creates an instance containing a mapping from `sequences` to unique + /// integers, using `alphabet`. Sequences are truncated at `maxLength` and + /// only those occurring `minFreq` times are included. + /// + /// - Parameter sequences: character sequences to compose the lexicon. + /// - Parameter alphabet: all characters contained in `sequences`. + /// - Parameter maxLength: sequence length at which truncation occurs. 
+ /// - Parameter minFreq: minimum required occurrence of each sequence. public init( from sequences: [CharacterSequence], alphabet: Alphabet, @@ -59,20 +76,3 @@ public struct Lexicon { self.init(vocab) } } - -public enum CharacterErrors: Error { - case unknownCharacter(character: Character, index: Int, sentence: String) - case nonUtf8Data -} - -extension CharacterErrors: CustomStringConvertible { - public var description: String { - switch self { - case let .unknownCharacter(character, index, sentence): - return - "Unknown character '\(character)' encountered at index \(index) while converting sentence \"\(sentence)\" to a character sequence." - case .nonUtf8Data: - return "Non-UTF8 data encountered." - } - } -} From 2d42404a122eaff45547f406664ebdaaa440488c Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Tue, 26 May 2020 21:19:36 -0400 Subject: [PATCH 02/30] Lint --- Support/Text/WordSeg/CharacterSequence.swift | 1 - 1 file changed, 1 deletion(-) diff --git a/Support/Text/WordSeg/CharacterSequence.swift b/Support/Text/WordSeg/CharacterSequence.swift index d5190d28c71..2eb4f6714bb 100644 --- a/Support/Text/WordSeg/CharacterSequence.swift +++ b/Support/Text/WordSeg/CharacterSequence.swift @@ -95,7 +95,6 @@ extension CharacterSequence: CustomStringConvertible { } } - /// An error that can be encountered when processing characters. 
public enum CharacterErrors: Error { case unknownCharacter(character: Character, index: Int, sentence: String) From bdf71a9fbba4ac0fa1a5f0c7c9179f9da5f2f1be Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Wed, 27 May 2020 02:07:54 +0000 Subject: [PATCH 03/30] Add dataset files --- Datasets/WordSeg/WordSegDataset.swift | 55 +++++++++++++++++++++++++++ Datasets/WordSeg/WordSegRecord.swift | 8 ++++ 2 files changed, 63 insertions(+) diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift index 260188173fe..93476f74488 100644 --- a/Datasets/WordSeg/WordSegDataset.swift +++ b/Datasets/WordSeg/WordSegDataset.swift @@ -15,21 +15,37 @@ import Foundation import ModelSupport +/// A collection of raw and processed text used for training and validation +/// of word segmentation models. public struct WordSegDataset { + /// A collection of text used for training. public let training: [WordSegRecord] + /// A collection of text used for testing. public private(set) var testing: [WordSegRecord]? + /// A collection of text used for validation. public private(set) var validation: [WordSegRecord]? + /// The set of characters found in all included texts. public let alphabet: Alphabet + /// Details used for downloading source data. private struct DownloadDetails { + /// The location of the archive. var archiveLocation = URL(string: "https://s3.eu-west-2.amazonaws.com/k-kawakami")! + /// The basename of the archive. var archiveFileName = "seg" + /// The extension of the archive. var archiveExtension = "zip" + /// The path to the test source. var testingFilePath = "br/br-text/te.txt" + /// The path to the training source. var trainingFilePath = "br/br-text/tr.txt" + /// The path to the validation source. var validationFilePath = "br/br-text/va.txt" } + /// Returns a list of records parsed from `data` in UTF8. + /// + /// - Parameter data: text in UTF8 format. 
private static func load(data: Data) throws -> [String] { guard let contents: String = String(data: data, encoding: .utf8) else { throw CharacterErrors.nonUtf8Data @@ -37,6 +53,10 @@ public struct WordSegDataset { return load(contents: contents) } + /// Separates `contents` into a collection of strings by newlines, trimming + /// leading and trailing whitespace and excluding blank lines. + /// + /// - Parameter contents: text to be separated by newline. private static func load(contents: String) -> [String] { var strings = [String]() @@ -48,6 +68,15 @@ public struct WordSegDataset { return strings } + /// Returns an alphabet composed of all characters found in `training` and + /// `otherSequences`. + /// + /// - Parameter training: full text of the training data. + /// - Parameter otherSequences: optional full text of the validation and + /// test data. + /// - Parameter eos: text to be used as the end of sequence marker. + /// - Parameter eow: text to be used as the end of word marker. + /// - Parameter pad: text to be used as the padding marker. private static func makeAlphabet( datasets training: [String], _ otherSequences: [String]?..., @@ -73,6 +102,10 @@ public struct WordSegDataset { return Alphabet(sorted, eos: eos, eow: eow, pad: pad) } + /// Creates a collection of records to be used with the WordSeg model. + /// + /// - Parameter dataset: text to be converted. + /// - Parameter alphabet: set of all characters used in `dataset`. private static func convertDataset(_ dataset: [String], alphabet: Alphabet) throws -> [WordSegRecord] { @@ -84,6 +117,12 @@ public struct WordSegDataset { alphabet: alphabet, appendingEoSTo: trimmed)) } } + + /// Returns a collection of records to be used with the WordSeg model, or + /// `nil` if `dataset` is empty. + /// + /// - Parameter dataset: text to be converted. + /// - Parameter alphabet: set of all characters used in `dataset`. 
private static func convertDataset(_ dataset: [String]?, alphabet: Alphabet) throws -> [WordSegRecord]? { @@ -94,6 +133,8 @@ public struct WordSegDataset { return nil } + /// Creates an instance containing `WordSegRecords` from the default + /// location. public init() throws { let downloadDetails = DownloadDetails() let localStorageDirectory: URL = FileManager.default.temporaryDirectory @@ -119,6 +160,11 @@ public struct WordSegDataset { testing: testingFilePath) } + /// Creates an instance containing `WordSegRecords` from the given files. + /// + /// - Parameter training: path to the file containing training data. + /// - Parameter validation: path to the file containing validation data. + /// - Parameter testing: path to the file containing test data. public init( training trainingFile: String, validation validationFile: String? = nil, @@ -151,6 +197,11 @@ public struct WordSegDataset { self.testing = try Self.convertDataset(testing, alphabet: self.alphabet) } + /// Creates an instance containing `WordSegRecords` from the given data. + /// + /// - Parameter training: contents of the training data. + /// - Parameter validation: contents of the validation data. + /// - Parameter testing: contents of the test data. public init( training trainingData: Data, validation validationData: Data?, testing testingData: Data? ) @@ -172,6 +223,10 @@ public struct WordSegDataset { self.testing = try Self.convertDataset(testing, alphabet: self.alphabet) } + /// Downloads and unpacks the source archive if it does not exist locally. + /// + /// - Parameter directory: local directory to store files. + /// - Parameter downloadDetails: where to find the source archive. 
private static func downloadIfNotPresent( to directory: URL, downloadDetails: DownloadDetails ) { diff --git a/Datasets/WordSeg/WordSegRecord.swift b/Datasets/WordSeg/WordSegRecord.swift index d0049a2a2c2..8445477a1eb 100644 --- a/Datasets/WordSeg/WordSegRecord.swift +++ b/Datasets/WordSeg/WordSegRecord.swift @@ -14,10 +14,18 @@ import ModelSupport +/// A sequence of text for use in word segmentation. public struct WordSegRecord { + /// A raw, unprocessed sequence of text. public let plainText: String + /// A sequence of text in numeric form, derived from `plainText`. public let numericalizedText: CharacterSequence + /// Creates an instance containing both raw and processed forms of a + /// sequence of text. + /// + /// - Parameter plainText: raw, unprocessed text. + /// - Parameter numericalizedText: processed text in numeric form. public init(plainText: String, numericalizedText: CharacterSequence) { self.plainText = plainText self.numericalizedText = numericalizedText From d44b5bf3177273a119fac0a5391d5a91cce85c95 Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Thu, 28 May 2020 16:56:52 +0000 Subject: [PATCH 04/30] Add lattice --- Models/Text/WordSeg/Lattice.swift | 107 +++++++++++++++++++++++++--- Support/Text/WordSeg/Alphabet.swift | 2 - 2 files changed, 96 insertions(+), 13 deletions(-) diff --git a/Models/Text/WordSeg/Lattice.swift b/Models/Text/WordSeg/Lattice.swift index 3a2babdc25d..cda03ba5218 100644 --- a/Models/Text/WordSeg/Lattice.swift +++ b/Models/Text/WordSeg/Lattice.swift @@ -23,23 +23,40 @@ import TensorFlow import Glibc #endif -/// Lattice -/// -/// Represents the lattice used by the WordSeg algorithm. +/// A structure used for scoring all possible segmentations of a character +/// sequence. The path with the best score provides the most likely +/// segmentation at inference. public struct Lattice: Differentiable { - /// Edge + /// Represents a word. 
/// - /// Represents an Edge + /// At each character position, an edge is constructed for every possible + /// segmentation of the preceding portion of the sequence. public struct Edge: Differentiable { + /// The node position immediately preceding this edge. @noDerivative public var start: Int + /// The node position immediately following this edge. @noDerivative public var end: Int + /// The characters composing a word. @noDerivative public var string: CharacterSequence + /// The log likelihood of this segmentation. public var logp: Tensor - // expectation + /// The expected score for this segmentation. public var score: SemiRing + /// The expected total score for this segmentation. public var totalScore: SemiRing + /// Creates an edge for `sentence` between `start` and `end` node + /// positions. Sets the log probability to `logp` and uses this value to + /// calculate the score. Sums the score with `previous` to determine the + /// total score. + /// + /// - Parameter start: the position of the start node. + /// - Parameter end: the position of the end node. + /// - Parameter sentence: the character sequence. + /// - Parameter logp: the log likelihood. + /// - Parameter previous: the total score of the preceding edge. + /// - Parameter order: the power of the length penalty. @differentiable init( start: Int, end: Int, sentence: CharacterSequence, logp: Tensor, @@ -58,6 +75,15 @@ public struct Lattice: Differentiable { self.totalScore = self.score * previous } + /// Creates an edge for `string` between `start` and `end` node + /// positions. Sets the log probability, score, and total score. + /// + /// - Parameter start: the position of the start node. + /// - Parameter end: the position of the end node. + /// - Parameter string: the character sequence. + /// - Parameter logp: the log likelihood. + /// - Parameter score: the current score. + /// - Parameter totalScore: the total score. 
@differentiable public init( start: Int, end: Int, string: CharacterSequence, logp: Tensor, @@ -72,17 +98,32 @@ public struct Lattice: Differentiable { } } - /// Node + /// Represents a word boundary. When a lattice is built, a start node is + /// created, followed by one for every character in the sequence, + /// representing every potential boundary. /// - /// Represents a node in the lattice + /// - Note: Scores are only meaningful in relation to incoming edges and the + /// start node has no incoming edges. public struct Node: Differentiable { + /// The incoming edge with the highest score. @noDerivative public var bestEdge: Edge? + /// The score of the best incoming edge. public var bestScore: Float = 0.0 + /// All incoming edges. public var edges = [Edge]() + /// A composite score of all incoming edges. public var semiringScore: SemiRing = SemiRing.one + /// Creates an empty instance. init() {} + /// Creates a node preceded by `bestEdge`. Stores `bestScore` and + /// `semiringScore`. Sets incoming edges to `edges`. + /// + /// - Parameter bestEdge: the best incoming edge. + /// - Parameter bestScore: the score of the best incoming edge. + /// - Parameter edges: the incoming edges. + /// - Parameter semiringScore: the composite score of all incoming edges. @differentiable public init( bestEdge: Edge?, bestScore: Float, edges: [Edge], @@ -94,20 +135,24 @@ public struct Lattice: Differentiable { self.semiringScore = semiringScore } + /// Calculates the semiring score by summing the total score of all edges. @differentiable func computeSemiringScore() -> SemiRing { // TODO: Reduceinto and += edges.differentiableMap { $0.totalScore }.sum() } + /// Calculates the current semiring score and sets `semiringScore`. @differentiable mutating func recomputeSemiringScore() { semiringScore = computeSemiringScore() } } + /// An ordered collection of nodes. var positions: [Node] + /// Accesses the node at the `index`th position. 
@differentiable public subscript(index: Int) -> Node { get { return positions[index] } @@ -121,16 +166,28 @@ public struct Lattice: Differentiable { // _modify { yield &positions[index] } } + /// Creates an empty instance with a start node, followed by `count` nodes. + /// + /// - Parameter count: the length of the lattice, e.g. number of characters + /// in the sequence. init(count: Int) { positions = Array(repeating: Node(), count: count + 1) } + /// Creates an instance with the nodes in `positions`. + /// + /// - Parameter positions: the nodes composing the lattice. public init(positions: [Node]) { self.positions = positions } + /// Returns a set of edges with the best total score. Traversing this path + /// produces a segmented version of `sentence`. + /// + /// - Parameter sentence: the text to be segmented. public mutating func viterbi(sentence: CharacterSequence) -> [Edge] { - // Forwards pass + // Forward pass + // Starts at 1 since the 0 node has no incoming edges. for position in 1...sentence.count { var bestScore = -Float.infinity var bestEdge: Edge! @@ -145,7 +202,7 @@ public struct Lattice: Differentiable { self[position].bestEdge = bestEdge } - // Backwards + // Backward pass var bestPath: [Edge] = [] var nextEdge = self[sentence.count].bestEdge! while nextEdge.start != 0 { @@ -157,6 +214,11 @@ public struct Lattice: Differentiable { return bestPath.reversed() } + /// Returns the plain text encoded in `path`, e.g. the segmentation of the + /// full character sequence. + /// + /// - Parameter path: a lattice path. + /// - Parameter alphabet: the alphabet used in path creation. public static func pathToPlainText(path: [Edge], alphabet: Alphabet) -> String { var plainText = [String]() for edge in path { @@ -171,6 +233,7 @@ public struct Lattice: Differentiable { } extension Lattice: CustomStringConvertible { + /// The plain text description of this instance that describes all nodes. 
public var description: String { """ [ @@ -181,6 +244,8 @@ extension Lattice: CustomStringConvertible { } extension Lattice.Node: CustomStringConvertible { + /// The plain text description of this instance that describes all incoming + /// edges. public var description: String { var edgesStr: String if edges.isEmpty { @@ -196,13 +261,19 @@ extension Lattice.Node: CustomStringConvertible { } extension Lattice.Edge: CustomStringConvertible { + /// The plain text description of this instance with all edge details. public var description: String { "[\(start)->\(end)] logp: \(logp), score: \(score.shortDescription), total score: \(totalScore.shortDescription), sentence: \(string)" } } -/// SE-0259-esque equality with tolerance extension Lattice { + /// Returns true when all nodes in `self` are within `tolerance` of all + /// nodes in `other`. This behavior is modeled after SE-0259. + /// + /// - Parameter other: the instance to be compared with `self`. + /// - Parameter tolerance: the amount of variability considered acceptable + /// in determining equality. public func isAlmostEqual(to other: Self, tolerance: Float) -> Bool { guard self.positions.count == other.positions.count else { print("positions count mismatch: \(self.positions.count) != \(other.positions.count)") @@ -221,6 +292,13 @@ extension Lattice { } extension Lattice.Node { + /// Returns true when all properties and edges in `self` are within + /// `tolerance` of all properties and edges in `other`. This behavior is + /// modeled after SE-0259. + /// + /// - Parameter other: the instance to be compared with `self`. + /// - Parameter tolerance: the amount of variability considered acceptable + /// in determining equality. 
public func isAlmostEqual(to other: Self, tolerance: Float) -> Bool { guard self.edges.count == other.edges.count else { return false } @@ -243,6 +321,13 @@ extension Lattice.Node { } extension Lattice.Edge { + /// Returns true when the log likelihood and scores in `self` are within + /// `tolerance` of the log likelihood and scores in `other`. This behavior + /// is modeled after SE-0259. + /// + /// - Parameter other: the instance to be compared with `self`. + /// - Parameter tolerance: the amount of variability considered acceptable + /// in determining equality. public func isAlmostEqual(to other: Self, tolerance: Float) -> Bool { return self.start == other.start && self.end == other.end // TODO: figure out why the string equality is being ignored diff --git a/Support/Text/WordSeg/Alphabet.swift b/Support/Text/WordSeg/Alphabet.swift index 273513436df..06add8d981f 100644 --- a/Support/Text/WordSeg/Alphabet.swift +++ b/Support/Text/WordSeg/Alphabet.swift @@ -28,10 +28,8 @@ public struct Alphabet { /// A marker denoting the end of a sequence. public let eos: Int32 - /// A marker denoting the end of a word. public let eow: Int32 - /// A marker used for padding inside a sequence. public let pad: Int32 From cf86c8cfdc17e8cacbfcd024dce6bc3226ebcca7 Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Thu, 28 May 2020 20:00:26 +0000 Subject: [PATCH 05/30] Add semiring --- Models/Text/WordSeg/SemiRing.swift | 35 +++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/Models/Text/WordSeg/SemiRing.swift b/Models/Text/WordSeg/SemiRing.swift index 6357c607557..c6854670036 100644 --- a/Models/Text/WordSeg/SemiRing.swift +++ b/Models/Text/WordSeg/SemiRing.swift @@ -22,9 +22,8 @@ import TensorFlow import Glibc #endif -/// logSumExp(_:) -/// -/// logSumExp (see https://en.wikipedia.org/wiki/LogSumExp) +/// Returns a single tensor containing the log of the sum of the exponentials +/// in `x`. 
Used for numerical stability when dealing with very small values. @differentiable public func logSumExp(_ x: [Tensor]) -> Tensor { // Deal with an empty array first. @@ -32,37 +31,45 @@ public func logSumExp(_ x: [Tensor]) -> Tensor { return Tensor(stacking: x).logSumExp() } -/// logSumExp(_:_:) -/// -/// Specialized logSumExp for 2 tensor of floats. +/// Returns a single tensor containing the log of the sum of the exponentials +/// in `lhs` and `rhs`. Used for numerical stability when dealing with very +/// small values. @differentiable public func logSumExp(_ lhs: Tensor, _ rhs: Tensor) -> Tensor { return logSumExp([lhs, rhs]) } -/// SemiRing -/// -/// Represents a SemiRing +/// A storage mechanism for scoring inside a lattice. public struct SemiRing: Differentiable { + /// The log likelihood. public var logp: Tensor + /// The regularization factor. public var logr: Tensor + /// Creates an instance with log likelihood `logp` and regularization + /// factor `logr`. @differentiable public init(logp: Tensor, logr: Tensor) { self.logp = logp self.logr = logr } + /// Creates an instance with log likelihood `logp` and regularization + /// factor `logr`. @differentiable public init(logp: Float, logr: Float) { self.logp = Tensor(logp) self.logr = Tensor(logr) } + /// The baseline score of zero. static var zero: SemiRing { SemiRing(logp: -Float.infinity, logr: -Float.infinity) } + /// The baseline score of one. static var one: SemiRing { SemiRing(logp: 0.0, logr: -Float.infinity) } } +/// Multiplies `lhs` by `rhs`. Since scores are on a logarithmic scale, +/// products become sums. @differentiable func * (_ lhs: SemiRing, _ rhs: SemiRing) -> SemiRing { return SemiRing( @@ -70,6 +77,7 @@ func * (_ lhs: SemiRing, _ rhs: SemiRing) -> SemiRing { logr: logSumExp(lhs.logp + rhs.logr, rhs.logp + lhs.logr)) } +/// Sums `lhs` by `rhs`. 
@differentiable func + (_ lhs: SemiRing, _ rhs: SemiRing) -> SemiRing { return SemiRing( @@ -78,6 +86,7 @@ func + (_ lhs: SemiRing, _ rhs: SemiRing) -> SemiRing { } extension Array where Element == SemiRing { + /// Returns a sum of all scores in the collection. @differentiable func sum() -> SemiRing { return SemiRing( @@ -87,13 +96,19 @@ extension Array where Element == SemiRing { } extension SemiRing { + /// The plain text description of this instance with score details. var shortDescription: String { "(\(logp), \(logr))" } } -/// SE-0259-esque equality with tolerance extension SemiRing { + /// Returns true when `self` is within `tolerance` of `other`. This behavior + /// is modeled after SE-0259. + /// + /// - Parameter other: the instance to be compared with `self`. + /// - Parameter tolerance: the amount of variability considered acceptable + /// in determining equality. // TODO(abdulras) see if we can use ulp as a default tolerance @inlinable public func isAlmostEqual(to other: Self, tolerance: Float) -> Bool { From 8c3812edda50e5ae113bfcc96baf892a5d496994 Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Thu, 28 May 2020 22:33:51 +0000 Subject: [PATCH 06/30] Add SNLM --- Models/Text/WordSeg/Model.swift | 84 +++++++++++++++++++++++++++------ 1 file changed, 69 insertions(+), 15 deletions(-) diff --git a/Models/Text/WordSeg/Model.swift b/Models/Text/WordSeg/Model.swift index 94b08ddc714..f36429cfcc1 100644 --- a/Models/Text/WordSeg/Model.swift +++ b/Models/Text/WordSeg/Model.swift @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + // Original Paper: // "Learning to Discover, Ground, and Use Words with Segmental Neural Language // Models" @@ -18,22 +19,29 @@ // https://www.aclweb.org/anthology/P19-1645.pdf // This implementation is not affiliated with DeepMind and has not been // verified by the authors. 
+ import ModelSupport import TensorFlow -/// SNLM -/// -/// A representation of the Segmental Neural Language Model. -/// -/// \ref https://www.aclweb.org/anthology/P19-1645.pdf +/// A Segmental Neural Language Model for word segmentation, as described in +/// the above paper. public struct SNLM: EuclideanDifferentiable, KeyPathIterable { + /// A set of configuration parameters that define model behavior. public struct Parameters { + /// The hidden unit size. public var ndim: Int + /// The dropout rate. public var dropoutProb: Double + /// The character vocabulary. public var chrVocab: Alphabet + /// The string vocabulary. public var strVocab: Lexicon + /// The power of the length penalty. public var order: Int + /// Creates an instance with `ndim` hidden units, `dropoutProb` dropout + /// rate, `chrVocab` alphabet, `strVocab` lexicon, and `order` power of + /// length penalty. public init( ndim: Int, dropoutProb: Double, @@ -49,27 +57,40 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { } } + /// The configuration parameters that define model behavior. @noDerivative public var parameters: Parameters // MARK: - Encoder + /// The embedding layer for the encoder. public var encoderEmbedding: Embedding + /// The LSTM layer for the encoder. public var encoderLSTM: LSTM // MARK: - Interpolation weight + /// The interpolation weight, which determines the proportion of + /// contributions from the lexical memory and character generation. public var mlpInterpolation: MLP // MARK: - Lexical memory + /// The lexical memory. public var mlpMemory: MLP // MARK: - Character-level decoder + /// The embedding layer for the decoder. public var decoderEmbedding: Embedding + /// The LSTM layer for the decoder. public var decoderLSTM: LSTM + /// The dense layer for the decoder. public var decoderDense: Dense // MARK: - Other layers + /// The dropout layer for both the encoder and decoder. 
public var dropout: Dropout // MARK: - Initializer + /// Creates an instance with the configuration defined by `parameters`. + /// + /// - Parameter parameters: the model configuration. public init(parameters: Parameters) { self.parameters = parameters @@ -113,7 +134,9 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { } // MARK: - Encode - /// Returns the hidden states of the encoder LSTM applied to the given sentence. + /// Returns the hidden states of the encoder LSTM applied to `x`. + /// + /// - Parameter x: the character sequence to encode. public func encode(_ x: CharacterSequence) -> [Tensor] { var embedded = encoderEmbedding(x.tensor) embedded = dropout(embedded) @@ -125,7 +148,10 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { } // MARK: - Decode - /// Returns log probabilities for each of the candidates. + /// Returns the log probabilities for each of the candidates. + /// + /// - Parameter candidates: the character sequences to decode. + /// - Parameter state: the hidden state from the encoder LSTM. public func decode(_ candidates: [CharacterSequence], _ state: Tensor) -> Tensor { // TODO(TF-433): Remove closure workaround when autodiff supports non-active rethrowing // functions (`Array.map`). @@ -192,6 +218,12 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { } // MARK: - buildLattice + /// Returns the log likelihood for `candidate` from the lexical memory + /// `logp_lex`. + /// + /// - Parameter logp_lex: all log likelihoods in the lexical memory. + /// - Parameter candidate: the character sequence for which to retrieve the + /// log likelihood. 
func get_logp_lex(_ logp_lex: Tensor, _ candidate: CharacterSequence) -> Tensor { guard let index = parameters.strVocab.dictionary[candidate] else { return Tensor(-Float.infinity) @@ -199,6 +231,12 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { return logp_lex[Int(index)] } + /// Returns a complete lattice for `sentence` with a maximum length of + /// `maxLen`. + /// + /// - Parameter sentence: the character sequence used for determining + /// segmentation. + /// - Parameter maxLen: the maximum allowable sequence length. @differentiable public func buildLattice(_ sentence: CharacterSequence, maxLen: Int) -> Lattice { var lattice = Lattice(count: sentence.count) @@ -265,15 +303,17 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { } extension Array { - // NOTE(TF-1277): this mutating method exists as a workaround for `Array.subscript._modify` not - // being differentiable. - // - // Semantically, it behaves like `Array.subscript.set`. + /// Sets the `index`th element of `self` to `value`. Semantically, it + /// behaves like `Array.subscript.set`. + /// + /// - Note: this mutating method exists as a workaround for + /// `Array.subscript._modify` not being differentiable (TF-1277). @inlinable mutating func update(at index: Int, to value: Element) { self[index] = value } + /// Returns the value and pullback of `self.update`. @usableFromInline @derivative(of: update) mutating func vjpUpdate(at index: Int, to value: Element) -> ( @@ -290,17 +330,29 @@ extension Array { } } +/// A multilayer perceptron with three layers. public struct MLP: Layer { + /// The first dense layer. public var dense1: Dense + /// The dropout layer. public var dropout: Dropout + /// The second dense layer. public var dense2: Dense + /// Creates an instance with input size `nIn`, `nHidden` hidden units, + /// dropout probability `dropoutProbability` and output size `nOut`. + /// + /// - Parameter nIn: input size. + /// - Parameter nHidden: number of hidden units. 
+ /// - Parameter nOut: output size. + /// - Parameter dropoutProbability: probability that an input is dropped. public init(nIn: Int, nHidden: Int, nOut: Int, dropoutProbability: Double) { dense1 = Dense(inputSize: nIn, outputSize: nHidden, activation: tanh) dropout = Dropout(probability: dropoutProbability) dense2 = Dense(inputSize: nHidden, outputSize: nOut, activation: logSoftmax) } + /// Returns the result of applying all three layers in sequence to `input`. @differentiable public func callAsFunction(_ input: Tensor) -> Tensor { return dense2(dropout(dense1(input))) @@ -308,10 +360,11 @@ public struct MLP: Layer { } extension Tensor { - // NOTE(TF-1008): this is a workaround for TF-1008 that is needed for differentiation - // correctness. - // - // Remove this when differentiation uses per-instance zeros + /// Returns `self`. + /// + /// - Note: this is a workaround for TF-1008 that is needed for + /// differentiation correctness. + // TODO: Remove this when differentiation uses per-instance zeros // (`Differentiable.zeroTangentVectorInitializer`) instead of static zeros // (`AdditiveArithmetic.zero`). @differentiable(where Scalar: TensorFlowFloatingPoint) @@ -319,6 +372,7 @@ extension Tensor { self } + /// Returns the value and pullback of `self.identityADHack`. 
@derivative(of: identityADHack) func vjpIdentityADHack() -> ( value: Tensor, pullback: (Tensor) -> Tensor From d36a20c39e659d37190d97063b9145d3012b0308 Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Thu, 28 May 2020 23:57:59 +0000 Subject: [PATCH 07/30] Add bullets for throws MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace various verbs with “returns” --- Datasets/WordSeg/WordSegDataset.swift | 19 ++++++++++++++++--- Support/Text/WordSeg/CharacterSequence.swift | 2 ++ 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift index 93476f74488..5e71ac18a13 100644 --- a/Datasets/WordSeg/WordSegDataset.swift +++ b/Datasets/WordSeg/WordSegDataset.swift @@ -46,6 +46,8 @@ public struct WordSegDataset { /// Returns a list of records parsed from `data` in UTF8. /// /// - Parameter data: text in UTF8 format. + /// + /// - Throws: An error of type 'CharacterErrors'. private static func load(data: Data) throws -> [String] { guard let contents: String = String(data: data, encoding: .utf8) else { throw CharacterErrors.nonUtf8Data @@ -53,8 +55,9 @@ public struct WordSegDataset { return load(contents: contents) } - /// Separates `contents` into a collection of strings by newlines, trimming - /// leading and trailing whitespace and excluding blank lines. + /// Returns a collection of strings created by separating `contents` by + /// newlines, trimming leading and trailing whitespace, and excluding blank + /// lines. /// /// - Parameter contents: text to be separated by newline. private static func load(contents: String) -> [String] { @@ -102,10 +105,12 @@ public struct WordSegDataset { return Alphabet(sorted, eos: eos, eow: eow, pad: pad) } - /// Creates a collection of records to be used with the WordSeg model. + /// Returns a collection of records to be used with the WordSeg model. /// /// - Parameter dataset: text to be converted. 
/// - Parameter alphabet: set of all characters used in `dataset`. + /// + /// - Throws: An error of type 'CharacterErrors'. private static func convertDataset(_ dataset: [String], alphabet: Alphabet) throws -> [WordSegRecord] { @@ -123,6 +128,8 @@ public struct WordSegDataset { /// /// - Parameter dataset: text to be converted. /// - Parameter alphabet: set of all characters used in `dataset`. + /// + /// - Throws: An error of type 'CharacterErrors'. private static func convertDataset(_ dataset: [String]?, alphabet: Alphabet) throws -> [WordSegRecord]? { @@ -135,6 +142,8 @@ public struct WordSegDataset { /// Creates an instance containing `WordSegRecords` from the default /// location. + /// + /// - Throws: An error of type 'CharacterErrors'. public init() throws { let downloadDetails = DownloadDetails() let localStorageDirectory: URL = FileManager.default.temporaryDirectory @@ -165,6 +174,8 @@ public struct WordSegDataset { /// - Parameter training: path to the file containing training data. /// - Parameter validation: path to the file containing validation data. /// - Parameter testing: path to the file containing test data. + /// + /// - Throws: An error of type 'CharacterErrors'. public init( training trainingFile: String, validation validationFile: String? = nil, @@ -202,6 +213,8 @@ public struct WordSegDataset { /// - Parameter training: contents of the training data. /// - Parameter validation: contents of the validation data. /// - Parameter testing: contents of the test data. + /// + /// - Throws: An error of type 'CharacterErrors'. public init( training trainingData: Data, validation validationData: Data?, testing testingData: Data? 
) diff --git a/Support/Text/WordSeg/CharacterSequence.swift b/Support/Text/WordSeg/CharacterSequence.swift index 2eb4f6714bb..b8df75fd8ea 100644 --- a/Support/Text/WordSeg/CharacterSequence.swift +++ b/Support/Text/WordSeg/CharacterSequence.swift @@ -33,6 +33,8 @@ public struct CharacterSequence: Hashable { /// - Parameter alphabet: character to integer mapping. /// - Parameter appendingEoSTo: string to be converted to a sequence of /// integers. + /// + /// - Throws: An error of type 'CharacterErrors'. public init(alphabet: Alphabet, appendingEoSTo string: String) throws { var characters = [Int32]() characters.reserveCapacity(string.count + 1) From 951a98b96f060690a98eeb09d0341d81f3ee1069 Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Tue, 2 Jun 2020 00:48:22 +0000 Subject: [PATCH 08/30] Rename WordSegRecord to Phrase Rename DownloadDetails to ReferenceArchive Combine URL with filename and extension Update main summary in WordSegDataset Add blank line before doc comments in WordSegDataset --- .../{WordSegRecord.swift => Phrase.swift} | 4 +- Datasets/WordSeg/WordSegDataset.swift | 93 +++++++++++-------- Examples/WordSeg/main.swift | 16 ++-- 3 files changed, 65 insertions(+), 48 deletions(-) rename Datasets/WordSeg/{WordSegRecord.swift => Phrase.swift} (97%) diff --git a/Datasets/WordSeg/WordSegRecord.swift b/Datasets/WordSeg/Phrase.swift similarity index 97% rename from Datasets/WordSeg/WordSegRecord.swift rename to Datasets/WordSeg/Phrase.swift index 8445477a1eb..46da27bb15b 100644 --- a/Datasets/WordSeg/WordSegRecord.swift +++ b/Datasets/WordSeg/Phrase.swift @@ -15,9 +15,11 @@ import ModelSupport /// A sequence of text for use in word segmentation. -public struct WordSegRecord { +public struct Phrase { + /// A raw, unprocessed sequence of text. public let plainText: String + /// A sequence of text in numeric form, derived from `plainText`. 
public let numericalizedText: CharacterSequence diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift index 131be208958..4ff8ad1521e 100644 --- a/Datasets/WordSeg/WordSegDataset.swift +++ b/Datasets/WordSeg/WordSegDataset.swift @@ -15,30 +15,38 @@ import Foundation import ModelSupport -/// A collection of raw and processed text used for training and validation -/// of word segmentation models. +/// A dataset targeted at the problem of word segmentation. +/// +/// The reference archive was published in the paper "Learning to Discover, +/// Ground, and Use Words with Segmental Neural Language Models" by Kazuya +/// Kawakami, Chris Dyer, and Phil Blunsom: +/// https://www.aclweb.org/anthology/P19-1645.pdf. public struct WordSegDataset { - /// A collection of text used for training. - public let training: [WordSegRecord] - /// A collection of text used for testing. - public private(set) var testing: [WordSegRecord]? - /// A collection of text used for validation. - public private(set) var validation: [WordSegRecord]? + + /// The text used for training. + public let trainingPhrases: [Phrase] + + /// The text used for testing. + public private(set) var testingPhrases: [Phrase]? + + /// The text used for validation. + public private(set) var validationPhrases: [Phrase]? + /// The set of characters found in all included texts. public let alphabet: Alphabet /// Details used for downloading source data. - private struct DownloadDetails { + private struct ReferenceArchive { + /// The location of the archive. - var archiveLocation = URL(string: "https://s3.eu-west-2.amazonaws.com/k-kawakami")! - /// The basename of the archive. - var archiveFileName = "seg" - /// The extension of the archive. - var archiveExtension = "zip" + var location = URL(string: "https://s3.eu-west-2.amazonaws.com/k-kawakami/seg.zip")! + /// The path to the test source. var testingFilePath = "br/br-text/te.txt" + /// The path to the training source. 
var trainingFilePath = "br/br-text/tr.txt" + /// The path to the validation source. var validationFilePath = "br/br-text/va.txt" } @@ -71,7 +79,7 @@ public struct WordSegDataset { return strings } - /// Returns an alphabet composed of all characters found in `training` and + /// Returns an alphabet composed of all characters found in `trainingPhrases` and /// `otherSequences`. /// /// - Parameter training: full text of the training data. @@ -112,11 +120,11 @@ public struct WordSegDataset { /// /// - Throws: An error of type 'CharacterErrors'. private static func convertDataset(_ dataset: [String], alphabet: Alphabet) throws - -> [WordSegRecord] + -> [Phrase] { return try dataset.map { let trimmed = $0.components(separatedBy: .whitespaces).joined() - return try WordSegRecord( + return try Phrase( plainText: $0, numericalizedText: CharacterSequence( alphabet: alphabet, appendingEoSTo: trimmed)) @@ -131,45 +139,48 @@ public struct WordSegDataset { /// /// - Throws: An error of type 'CharacterErrors'. private static func convertDataset(_ dataset: [String]?, alphabet: Alphabet) throws - -> [WordSegRecord]? + -> [Phrase]? { if let ds = dataset { - let tmp: [WordSegRecord] = try convertDataset(ds, alphabet: alphabet) // Use tmp to disambiguate function + let tmp: [Phrase] = try convertDataset(ds, alphabet: alphabet) // Use tmp to disambiguate function return tmp } return nil } - /// Creates an instance containing `WordSegRecords` from the default + /// Creates an instance containing `Phrase`s from the default /// location. /// /// - Throws: An error of type 'CharacterErrors'. 
public init() throws { - let downloadDetails = DownloadDetails() + let referenceArchive = ReferenceArchive() let localStorageDirectory: URL = DatasetUtilities.defaultDirectory .appendingPathComponent("WordSeg", isDirectory: true) - WordSegDataset.downloadIfNotPresent(to: localStorageDirectory, downloadDetails: downloadDetails) + WordSegDataset.downloadIfNotPresent(to: localStorageDirectory, referenceArchive: referenceArchive) + let archiveFileName = + referenceArchive + .location.deletingPathExtension().lastPathComponent let archiveDirectory = localStorageDirectory - .appendingPathComponent(downloadDetails.archiveFileName) + .appendingPathComponent(archiveFileName) let trainingFilePath = archiveDirectory - .appendingPathComponent(downloadDetails.trainingFilePath).path + .appendingPathComponent(referenceArchive.trainingFilePath).path let validationFilePath = archiveDirectory - .appendingPathComponent(downloadDetails.validationFilePath).path + .appendingPathComponent(referenceArchive.validationFilePath).path let testingFilePath = archiveDirectory - .appendingPathComponent(downloadDetails.testingFilePath).path + .appendingPathComponent(referenceArchive.testingFilePath).path try self.init( training: trainingFilePath, validation: validationFilePath, testing: testingFilePath) } - /// Creates an instance containing `WordSegRecords` from the given files. + /// Creates an instance containing `Phrase`s from the given files. /// /// - Parameter training: path to the file containing training data. /// - Parameter validation: path to the file containing validation data. 
@@ -203,12 +214,12 @@ public struct WordSegDataset { testing = try Self.load(data: data) } self.alphabet = Self.makeAlphabet(datasets: training, validation, testing) - self.training = try Self.convertDataset(training, alphabet: self.alphabet) - self.validation = try Self.convertDataset(validation, alphabet: self.alphabet) - self.testing = try Self.convertDataset(testing, alphabet: self.alphabet) + self.trainingPhrases = try Self.convertDataset(training, alphabet: self.alphabet) + self.validationPhrases = try Self.convertDataset(validation, alphabet: self.alphabet) + self.testingPhrases = try Self.convertDataset(testing, alphabet: self.alphabet) } - /// Creates an instance containing `WordSegRecords` from the given data. + /// Creates an instance containing `Phrase`s from the given data. /// /// - Parameter training: contents of the training data. /// - Parameter validation: contents of the validation data. @@ -231,17 +242,17 @@ public struct WordSegDataset { } self.alphabet = Self.makeAlphabet(datasets: training, validation, testing) - self.training = try Self.convertDataset(training, alphabet: self.alphabet) - self.validation = try Self.convertDataset(validation, alphabet: self.alphabet) - self.testing = try Self.convertDataset(testing, alphabet: self.alphabet) + self.trainingPhrases = try Self.convertDataset(training, alphabet: self.alphabet) + self.validationPhrases = try Self.convertDataset(validation, alphabet: self.alphabet) + self.testingPhrases = try Self.convertDataset(testing, alphabet: self.alphabet) } /// Downloads and unpacks the source archive if it does not exist locally. /// /// - Parameter directory: local directory to store files. - /// - Parameter downloadDetails: where to find the source archive. + /// - Parameter referenceArchive: where to find the source archive. 
private static func downloadIfNotPresent( - to directory: URL, downloadDetails: DownloadDetails + to directory: URL, referenceArchive: ReferenceArchive ) { let downloadPath = directory.path let directoryExists = FileManager.default.fileExists(atPath: downloadPath) @@ -250,11 +261,15 @@ public struct WordSegDataset { guard !directoryExists || directoryEmpty else { return } + let remoteRoot = referenceArchive.location.deletingLastPathComponent() + let filename = referenceArchive.location.deletingPathExtension().lastPathComponent + let fileExtension = referenceArchive.location.pathExtension + // Downloads and extracts dataset files. let _ = DatasetUtilities.downloadResource( - filename: downloadDetails.archiveFileName, - fileExtension: downloadDetails.archiveExtension, - remoteRoot: downloadDetails.archiveLocation, + filename: filename, + fileExtension: fileExtension, + remoteRoot: remoteRoot, localStorageDirectory: directory, extract: true) } } diff --git a/Examples/WordSeg/main.swift b/Examples/WordSeg/main.swift index 98ca33e2104..a09db0ce51f 100644 --- a/Examples/WordSeg/main.swift +++ b/Examples/WordSeg/main.swift @@ -50,7 +50,7 @@ default: usage() } -let sequences = dataset.training.map { $0.numericalizedText } +let sequences = dataset.trainingPhrases.map { $0.numericalizedText } let lexicon = Lexicon( from: sequences, alphabet: dataset.alphabet, @@ -76,8 +76,8 @@ for epoch in 1...maxEpochs { Context.local.learningPhase = .training var trainingLossSum: Float = 0 var trainingBatchCount = 0 - for record in dataset.training { - let sentence = record.numericalizedText + for phrase in dataset.trainingPhrases { + let sentence = phrase.numericalizedText let (loss, gradients) = valueWithGradient(at: model) { model -> Tensor in let lattice = model.buildLattice(sentence, maxLen: maxLength) let score = lattice[sentence.count].semiringScore @@ -103,7 +103,7 @@ for epoch in 1...maxEpochs { trainingLossHistory.append(trainingLoss) reduceLROnPlateau(lossHistory: 
trainingLossHistory, optimizer: optimizer) - guard let validationDataset = dataset.validation else { + guard let validationPhrases = dataset.validationPhrases else { print( """ [Epoch \(epoch)] \ @@ -127,8 +127,8 @@ for epoch in 1...maxEpochs { var validationBatchCount = 0 var validationCharacterCount = 0 var validationPlainText: String = "" - for record in validationDataset { - let sentence = record.numericalizedText + for phrase in validationPhrases { + let sentence = phrase.numericalizedText var lattice = model.buildLattice(sentence, maxLen: maxLength) let score = lattice[sentence.count].semiringScore @@ -137,8 +137,8 @@ for epoch in 1...maxEpochs { validationCharacterCount += sentence.count // View a sample segmentation once per epoch. - if validationBatchCount == validationDataset.count { - let bestPath = lattice.viterbi(sentence: record.numericalizedText) + if validationBatchCount == validationPhrases.count { + let bestPath = lattice.viterbi(sentence: phrase.numericalizedText) validationPlainText = Lattice.pathToPlainText(path: bestPath, alphabet: dataset.alphabet) } } From 4de6e7d8eba9b8f22629ca5e1b3c903027ab2180 Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Tue, 2 Jun 2020 01:00:05 +0000 Subject: [PATCH 09/30] Update CMakeLists --- Datasets/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Datasets/CMakeLists.txt b/Datasets/CMakeLists.txt index 3f1b519cb7d..b2344d69e4c 100644 --- a/Datasets/CMakeLists.txt +++ b/Datasets/CMakeLists.txt @@ -21,7 +21,7 @@ add_library(Datasets TensorPair.swift TextUnsupervised/TextUnsupervised.swift WordSeg/WordSegDataset.swift - WordSeg/WordSegRecord.swift + WordSeg/Phrase.swift ImageSegmentationDataset.swift OxfordIIITPets/OxfordIIITPets.swift) target_link_libraries(Datasets PUBLIC From 060e88b7d62f08c5b9263e9a40460575d27d3040 Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Tue, 2 Jun 2020 02:38:16 +0000 Subject: [PATCH 10/30] Clarify more summaries. 
Remove explicit parameter descriptions and add them to summaries. Handle errors instead of throwing. Remove CharacterErrors.nonUtf8Data. Update attribute names in dataset tests. --- Datasets/WordSeg/WordSegDataset.swift | 143 ++++++++---------- Support/Text/WordSeg/CharacterSequence.swift | 3 - .../WordSeg/WordSegDatasetTests.swift | 16 +- 3 files changed, 67 insertions(+), 95 deletions(-) diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift index 4ff8ad1521e..119b047d58a 100644 --- a/Datasets/WordSeg/WordSegDataset.swift +++ b/Datasets/WordSeg/WordSegDataset.swift @@ -32,10 +32,10 @@ public struct WordSegDataset { /// The text used for validation. public private(set) var validationPhrases: [Phrase]? - /// The set of characters found in all included texts. + /// The union of all characters in the included dataset. public let alphabet: Alphabet - /// Details used for downloading source data. + /// A pointer to source data. private struct ReferenceArchive { /// The location of the archive. @@ -51,23 +51,15 @@ public struct WordSegDataset { var validationFilePath = "br/br-text/va.txt" } - /// Returns a list of records parsed from `data` in UTF8. - /// - /// - Parameter data: text in UTF8 format. - /// - /// - Throws: An error of type 'CharacterErrors'. - private static func load(data: Data) throws -> [String] { + /// Returns the text of all phrases parsed from `data` in UTF8. + private static func load(data: Data) -> [String] { guard let contents: String = String(data: data, encoding: .utf8) else { - throw CharacterErrors.nonUtf8Data + return [] } return load(contents: contents) } - /// Returns a collection of strings created by separating `contents` by - /// newlines, trimming leading and trailing whitespace, and excluding blank - /// lines. - /// - /// - Parameter contents: text to be separated by newline. + /// Returns the text of all phrases from `contents`. 
private static func load(contents: String) -> [String] { var strings = [String]() @@ -79,12 +71,8 @@ public struct WordSegDataset { return strings } - /// Returns an alphabet composed of all characters found in `trainingPhrases` and - /// `otherSequences`. + /// Returns the union of all characters in `training` and `otherSequences`. /// - /// - Parameter training: full text of the training data. - /// - Parameter otherSequences: optional full text of the validation and - /// test data. /// - Parameter eos: text to be used as the end of sequence marker. /// - Parameter eow: text to be used as the end of word marker. /// - Parameter pad: text to be used as the padding marker. @@ -113,45 +101,33 @@ public struct WordSegDataset { return Alphabet(sorted, eos: eos, eow: eow, pad: pad) } - /// Returns a collection of records to be used with the WordSeg model. - /// - /// - Parameter dataset: text to be converted. - /// - Parameter alphabet: set of all characters used in `dataset`. + /// Returns phrases from `dataset`, using `alphabet`, to be used with the + /// WordSeg model. /// - /// - Throws: An error of type 'CharacterErrors'. - private static func convertDataset(_ dataset: [String], alphabet: Alphabet) throws + /// - Note: Omits any part of the dataset that cannot be converted to + /// `CharacterSequence`. + private static func convertDataset(_ dataset: [String], alphabet: Alphabet) -> [Phrase] { - return try dataset.map { - let trimmed = $0.components(separatedBy: .whitespaces).joined() - return try Phrase( - plainText: $0, - numericalizedText: CharacterSequence( - alphabet: alphabet, appendingEoSTo: trimmed)) + var phrases = [Phrase]() + + for data in dataset { + let trimmed = data.components(separatedBy: .whitespaces).joined() + guard let numericalizedText = try? 
CharacterSequence( + alphabet: alphabet, appendingEoSTo: trimmed) else { continue } + let phrase = Phrase( + plainText: data, + numericalizedText: numericalizedText) + phrases.append(phrase) } - } - /// Returns a collection of records to be used with the WordSeg model, or - /// `nil` if `dataset` is empty. - /// - /// - Parameter dataset: text to be converted. - /// - Parameter alphabet: set of all characters used in `dataset`. - /// - /// - Throws: An error of type 'CharacterErrors'. - private static func convertDataset(_ dataset: [String]?, alphabet: Alphabet) throws - -> [Phrase]? - { - if let ds = dataset { - let tmp: [Phrase] = try convertDataset(ds, alphabet: alphabet) // Use tmp to disambiguate function - return tmp - } - return nil + return phrases } - /// Creates an instance containing `Phrase`s from the default - /// location. + /// Creates an instance containing phrases from the default location. /// - /// - Throws: An error of type 'CharacterErrors'. + /// - Throws: an error in the Cocoa domain, if the default training file + /// cannot be read. public init() throws { let referenceArchive = ReferenceArchive() let localStorageDirectory: URL = DatasetUtilities.defaultDirectory @@ -180,13 +156,11 @@ public struct WordSegDataset { testing: testingFilePath) } - /// Creates an instance containing `Phrase`s from the given files. + /// Creates an instance containing phrases from `trainingFile`, and + /// optionally `validationFile` and `testingFile`. /// - /// - Parameter training: path to the file containing training data. - /// - Parameter validation: path to the file containing validation data. - /// - Parameter testing: path to the file containing test data. - /// - /// - Throws: An error of type 'CharacterErrors'. + /// - Throws: an error in the Cocoa domain, if `trainingFile` cannot be + /// read. public init( training trainingFile: String, validation validationFile: String? 
= nil, @@ -195,62 +169,63 @@ public struct WordSegDataset { let trainingData = try Data( contentsOf: URL(fileURLWithPath: trainingFile), options: .alwaysMapped) - let training = try Self.load(data: trainingData) + let training = Self.load(data: trainingData) - var validation: [String]? = nil - var testing: [String]? = nil + let validation: [String] + let testing: [String] if let validationFile = validationFile { let data = try Data( contentsOf: URL(fileURLWithPath: validationFile), options: .alwaysMapped) - validation = try Self.load(data: data) + validation = Self.load(data: data) + } else { + validation = [String]() } if let testingFile = testingFile { let data: Data = try Data( contentsOf: URL(fileURLWithPath: testingFile), options: .alwaysMapped) - testing = try Self.load(data: data) + testing = Self.load(data: data) + } else { + testing = [String]() } + self.alphabet = Self.makeAlphabet(datasets: training, validation, testing) - self.trainingPhrases = try Self.convertDataset(training, alphabet: self.alphabet) - self.validationPhrases = try Self.convertDataset(validation, alphabet: self.alphabet) - self.testingPhrases = try Self.convertDataset(testing, alphabet: self.alphabet) + self.trainingPhrases = Self.convertDataset(training, alphabet: self.alphabet) + self.validationPhrases = Self.convertDataset(validation, alphabet: self.alphabet) + self.testingPhrases = Self.convertDataset(testing, alphabet: self.alphabet) } - /// Creates an instance containing `Phrase`s from the given data. - /// - /// - Parameter training: contents of the training data. - /// - Parameter validation: contents of the validation data. - /// - Parameter testing: contents of the test data. - /// - /// - Throws: An error of type 'CharacterErrors'. + /// Creates an instance containing phrases from `trainingData`, and + /// optionally `validationData` and `testingData`. public init( training trainingData: Data, validation validationData: Data?, testing testingData: Data? 
) - throws { - let training = try Self.load(data: trainingData) - var validation: [String]? = nil - var testing: [String]? = nil + let training = Self.load(data: trainingData) + let validation: [String] + let testing: [String] if let validationData = validationData { - validation = try Self.load(data: validationData) + validation = Self.load(data: validationData) + } else { + validation = [String]() } if let testingData = testingData { - testing = try Self.load(data: testingData) + testing = Self.load(data: testingData) + } else { + testing = [String]() } self.alphabet = Self.makeAlphabet(datasets: training, validation, testing) - self.trainingPhrases = try Self.convertDataset(training, alphabet: self.alphabet) - self.validationPhrases = try Self.convertDataset(validation, alphabet: self.alphabet) - self.testingPhrases = try Self.convertDataset(testing, alphabet: self.alphabet) + self.trainingPhrases = Self.convertDataset(training, alphabet: self.alphabet) + self.validationPhrases = Self.convertDataset(validation, alphabet: self.alphabet) + self.testingPhrases = Self.convertDataset(testing, alphabet: self.alphabet) } - /// Downloads and unpacks the source archive if it does not exist locally. - /// - /// - Parameter directory: local directory to store files. - /// - Parameter referenceArchive: where to find the source archive. + /// Downloads and unpacks `referenceArchive` to `directory` if it does not + /// exist locally. private static func downloadIfNotPresent( to directory: URL, referenceArchive: ReferenceArchive ) { diff --git a/Support/Text/WordSeg/CharacterSequence.swift b/Support/Text/WordSeg/CharacterSequence.swift index b8df75fd8ea..d4c4101afba 100644 --- a/Support/Text/WordSeg/CharacterSequence.swift +++ b/Support/Text/WordSeg/CharacterSequence.swift @@ -100,7 +100,6 @@ extension CharacterSequence: CustomStringConvertible { /// An error that can be encountered when processing characters. 
public enum CharacterErrors: Error { case unknownCharacter(character: Character, index: Int, sentence: String) - case nonUtf8Data } extension CharacterErrors: CustomStringConvertible { @@ -110,8 +109,6 @@ extension CharacterErrors: CustomStringConvertible { case let .unknownCharacter(character, index, sentence): return "Unknown character '\(character)' encountered at index \(index) while converting sentence \"\(sentence)\" to a character sequence." - case .nonUtf8Data: - return "Non-UTF8 data encountered." } } } diff --git a/Tests/DatasetsTests/WordSeg/WordSegDatasetTests.swift b/Tests/DatasetsTests/WordSeg/WordSegDatasetTests.swift index e3ea2da3d37..06f073dd542 100644 --- a/Tests/DatasetsTests/WordSeg/WordSegDatasetTests.swift +++ b/Tests/DatasetsTests/WordSeg/WordSegDatasetTests.swift @@ -20,26 +20,26 @@ class WordSegDatasetTests: XCTestCase { func testCreateWordSegDataset() { do { let dataset = try WordSegDataset() - XCTAssertEqual(dataset.training.count, 7832) - XCTAssertEqual(dataset.validation!.count, 979) - XCTAssertEqual(dataset.testing!.count, 979) + XCTAssertEqual(dataset.trainingPhrases.count, 7832) + XCTAssertEqual(dataset.validationPhrases!.count, 979) + XCTAssertEqual(dataset.testingPhrases!.count, 979) // Check the first example in each set. 
let trainingExample: [Int32] = [ 26, 16, 22, 24, 2, 15, 21, 21, 16, 20, 6, 6, 21, 9, 6, 3, 16, 16, 12, 28, ] - XCTAssertEqual(dataset.training[0].numericalizedText.characters, trainingExample) + XCTAssertEqual(dataset.trainingPhrases[0].numericalizedText.characters, trainingExample) let validationExample: [Int32] = [9, 6, 13, 13, 16, 14, 10, 14, 10, 28] - XCTAssertEqual(dataset.validation![0].numericalizedText.characters, validationExample) + XCTAssertEqual(dataset.validationPhrases![0].numericalizedText.characters, validationExample) let testingExample: [Int32] = [ 13, 6, 21, 14, 6, 20, 6, 6, 10, 7, 10, 4, 2, 15, 20, 6, 6, 2, 15, 26, 3, 16, 5, 26, 10, 15, 21, 9, 2, 21, 14, 10, 19, 19, 16, 19, 28, ] - XCTAssertEqual(dataset.testing![0].numericalizedText.characters, testingExample) + XCTAssertEqual(dataset.testingPhrases![0].numericalizedText.characters, testingExample) } catch { XCTFail(error.localizedDescription) } @@ -57,12 +57,12 @@ class WordSegDatasetTests: XCTestCase { Data( bytesNoCopy: UnsafeMutableRawPointer(mutating: address), count: pointer.count, deallocator: .none) - dataset = try? 
WordSegDataset(training: training, validation: nil, testing: nil) + dataset = WordSegDataset(training: training, validation: nil, testing: nil) } // 'a', 'h', 'l', 'p', '', '', '' XCTAssertEqual(dataset?.alphabet.count, 7) - XCTAssertEqual(dataset?.training.count, 1) + XCTAssertEqual(dataset?.trainingPhrases.count, 1) } static var allTests = [ From 04c58bbc23ca29aec072e4c6571310ea2baa80ed Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Mon, 1 Jun 2020 22:42:45 -0400 Subject: [PATCH 11/30] Lint --- Datasets/WordSeg/WordSegDataset.swift | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift index 119b047d58a..f45a3374007 100644 --- a/Datasets/WordSeg/WordSegDataset.swift +++ b/Datasets/WordSeg/WordSegDataset.swift @@ -113,8 +113,10 @@ public struct WordSegDataset { for data in dataset { let trimmed = data.components(separatedBy: .whitespaces).joined() - guard let numericalizedText = try? CharacterSequence( - alphabet: alphabet, appendingEoSTo: trimmed) else { continue } + guard + let numericalizedText = try? CharacterSequence( + alphabet: alphabet, appendingEoSTo: trimmed) + else { continue } let phrase = Phrase( plainText: data, numericalizedText: numericalizedText) @@ -133,7 +135,8 @@ public struct WordSegDataset { let localStorageDirectory: URL = DatasetUtilities.defaultDirectory .appendingPathComponent("WordSeg", isDirectory: true) - WordSegDataset.downloadIfNotPresent(to: localStorageDirectory, referenceArchive: referenceArchive) + WordSegDataset.downloadIfNotPresent( + to: localStorageDirectory, referenceArchive: referenceArchive) let archiveFileName = referenceArchive @@ -202,8 +205,7 @@ public struct WordSegDataset { /// optionally `validationData` and `testingData`. public init( training trainingData: Data, validation validationData: Data?, testing testingData: Data? 
- ) - { + ) { let training = Self.load(data: trainingData) let validation: [String] let testing: [String] From 079c47f95c2470f4bada9c0e49b864405fc05b9f Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Tue, 2 Jun 2020 15:51:42 +0000 Subject: [PATCH 12/30] Add blank lines Update summary in Phrase to include parameter names. --- Datasets/WordSeg/Phrase.swift | 7 ++---- Models/Text/WordSeg/Lattice.swift | 16 ++++++++++++++ Models/Text/WordSeg/Model.swift | 23 ++++++++++++++++++++ Models/Text/WordSeg/SemiRing.swift | 6 +++++ Support/Text/WordSeg/Alphabet.swift | 3 +++ Support/Text/WordSeg/CharacterSequence.swift | 6 +++++ Support/Text/WordSeg/Lexicon.swift | 1 + 7 files changed, 57 insertions(+), 5 deletions(-) diff --git a/Datasets/WordSeg/Phrase.swift b/Datasets/WordSeg/Phrase.swift index 46da27bb15b..e2ccb02fad1 100644 --- a/Datasets/WordSeg/Phrase.swift +++ b/Datasets/WordSeg/Phrase.swift @@ -23,11 +23,8 @@ public struct Phrase { /// A sequence of text in numeric form, derived from `plainText`. public let numericalizedText: CharacterSequence - /// Creates an instance containing both raw and processed forms of a - /// sequence of text. - /// - /// - Parameter plainText: raw, unprocessed text. - /// - Parameter numericalizedText: processed text in numeric form. + /// Creates an instance containing both raw (`plainText`) and processed + /// (`numericalizedText`) forms of a sequence of text. public init(plainText: String, numericalizedText: CharacterSequence) { self.plainText = plainText self.numericalizedText = numericalizedText diff --git a/Models/Text/WordSeg/Lattice.swift b/Models/Text/WordSeg/Lattice.swift index cda03ba5218..ddd5168883d 100644 --- a/Models/Text/WordSeg/Lattice.swift +++ b/Models/Text/WordSeg/Lattice.swift @@ -27,22 +27,28 @@ import TensorFlow /// sequence. The path with the best score provides the most likely /// segmentation at inference. public struct Lattice: Differentiable { + /// Represents a word. 
/// /// At each character position, an edge is constructed for every possible /// segmentation of the preceding portion of the sequence. public struct Edge: Differentiable { + /// The node position immediately preceding this edge. @noDerivative public var start: Int + /// The node position immediately following this edge. @noDerivative public var end: Int + /// The characters composing a word. @noDerivative public var string: CharacterSequence + /// The log likelihood of this segmentation. public var logp: Tensor /// The expected score for this segmentation. public var score: SemiRing + /// The expected total score for this segmentation. public var totalScore: SemiRing @@ -105,12 +111,16 @@ public struct Lattice: Differentiable { /// - Note: Scores are only meaningful in relation to incoming edges and the /// start node has no incoming edges. public struct Node: Differentiable { + /// The incoming edge with the highest score. @noDerivative public var bestEdge: Edge? + /// The score of the best incoming edge. public var bestScore: Float = 0.0 + /// All incoming edges. public var edges = [Edge]() + /// A composite score of all incoming edges. public var semiringScore: SemiRing = SemiRing.one @@ -233,6 +243,7 @@ public struct Lattice: Differentiable { } extension Lattice: CustomStringConvertible { + /// The plain text description of this instance that describes all nodes. public var description: String { """ @@ -244,6 +255,7 @@ extension Lattice: CustomStringConvertible { } extension Lattice.Node: CustomStringConvertible { + /// The plain text description of this instance that describes all incoming /// edges. public var description: String { @@ -261,6 +273,7 @@ extension Lattice.Node: CustomStringConvertible { } extension Lattice.Edge: CustomStringConvertible { + /// The plain text description of this instance with all edge details. 
public var description: String { "[\(start)->\(end)] logp: \(logp), score: \(score.shortDescription), total score: \(totalScore.shortDescription), sentence: \(string)" @@ -268,6 +281,7 @@ extension Lattice.Edge: CustomStringConvertible { } extension Lattice { + /// Returns true when all nodes in `self` are within `tolerance` of all /// nodes in `other`. This behavior is modeled after SE-0259. /// @@ -292,6 +306,7 @@ extension Lattice { } extension Lattice.Node { + /// Returns true when all properties and edges in `self` are within /// `tolerance` of all properties and edges in `other`. This behavior is /// modeled after SE-0259. @@ -321,6 +336,7 @@ extension Lattice.Node { } extension Lattice.Edge { + /// Returns true when the log likelihood and scores in `self` are within /// `tolerance` of the log likelihood and scores in `other`. This behavior /// is modeled after SE-0259. diff --git a/Models/Text/WordSeg/Model.swift b/Models/Text/WordSeg/Model.swift index f36429cfcc1..b5c341f97f0 100644 --- a/Models/Text/WordSeg/Model.swift +++ b/Models/Text/WordSeg/Model.swift @@ -26,16 +26,22 @@ import TensorFlow /// A Segmental Neural Language Model for word segmentation, as described in /// the above paper. public struct SNLM: EuclideanDifferentiable, KeyPathIterable { + /// A set of configuration parameters that define model behavior. public struct Parameters { + /// The hidden unit size. public var ndim: Int + /// The dropout rate. public var dropoutProb: Double + /// The character vocabulary. public var chrVocab: Alphabet + /// The string vocabulary. public var strVocab: Lexicon + /// The power of the length penalty. public var order: Int @@ -61,33 +67,42 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { @noDerivative public var parameters: Parameters // MARK: - Encoder + /// The embedding layer for the encoder. public var encoderEmbedding: Embedding + /// The LSTM layer for the encoder. 
public var encoderLSTM: LSTM // MARK: - Interpolation weight + /// The interpolation weight, which determines the proportion of /// contributions from the lexical memory and character generation. public var mlpInterpolation: MLP // MARK: - Lexical memory + /// The lexical memory. public var mlpMemory: MLP // MARK: - Character-level decoder + /// The embedding layer for the decoder. public var decoderEmbedding: Embedding + /// The LSTM layer for the decoder. public var decoderLSTM: LSTM + /// The dense layer for the decoder. public var decoderDense: Dense // MARK: - Other layers + /// The dropout layer for both the encoder and decoder. public var dropout: Dropout // MARK: - Initializer + /// Creates an instance with the configuration defined by `parameters`. /// /// - Parameter parameters: the model configuration. @@ -134,6 +149,7 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { } // MARK: - Encode + /// Returns the hidden states of the encoder LSTM applied to `x`. /// /// - Parameter x: the character sequence to encode. @@ -148,6 +164,7 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { } // MARK: - Decode + /// Returns the log probabilities for each of the candidates. /// /// - Parameter candidates: the character sequences to decode. @@ -218,6 +235,7 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { } // MARK: - buildLattice + /// Returns the log likelihood for `candidate` from the lexical memory /// `logp_lex`. /// @@ -303,6 +321,7 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { } extension Array { + /// Sets the `index`th element of `self` to `value`. Semantically, it /// behaves like `Array.subscript.set`. /// @@ -332,10 +351,13 @@ extension Array { /// A multilayer perceptron with three layers. public struct MLP: Layer { + /// The first dense layer. public var dense1: Dense + /// The dropout layer. public var dropout: Dropout + /// The second dense layer. 
public var dense2: Dense @@ -360,6 +382,7 @@ public struct MLP: Layer { } extension Tensor { + /// Returns `self`. /// /// - Note: this is a workaround for TF-1008 that is needed for diff --git a/Models/Text/WordSeg/SemiRing.swift b/Models/Text/WordSeg/SemiRing.swift index c6854670036..0c7809b10c9 100644 --- a/Models/Text/WordSeg/SemiRing.swift +++ b/Models/Text/WordSeg/SemiRing.swift @@ -41,8 +41,10 @@ public func logSumExp(_ lhs: Tensor, _ rhs: Tensor) -> Tensor + /// The regularization factor. public var logr: Tensor @@ -64,6 +66,7 @@ public struct SemiRing: Differentiable { /// The baseline score of zero. static var zero: SemiRing { SemiRing(logp: -Float.infinity, logr: -Float.infinity) } + /// The baseline score of one. static var one: SemiRing { SemiRing(logp: 0.0, logr: -Float.infinity) } } @@ -86,6 +89,7 @@ func + (_ lhs: SemiRing, _ rhs: SemiRing) -> SemiRing { } extension Array where Element == SemiRing { + /// Returns a sum of all scores in the collection. @differentiable func sum() -> SemiRing { @@ -96,6 +100,7 @@ extension Array where Element == SemiRing { } extension SemiRing { + /// The plain text description of this instance with score details. var shortDescription: String { "(\(logp), \(logr))" @@ -103,6 +108,7 @@ extension SemiRing { } extension SemiRing { + /// Returns true when `self` is within `tolerance` of `other`. This behavior /// is modeled after SE-0259. /// diff --git a/Support/Text/WordSeg/Alphabet.swift b/Support/Text/WordSeg/Alphabet.swift index 06add8d981f..ac5eb66aee4 100644 --- a/Support/Text/WordSeg/Alphabet.swift +++ b/Support/Text/WordSeg/Alphabet.swift @@ -20,6 +20,7 @@ import TensorFlow /// /// - Note: We map from String in order to support multi-character metadata sequences such as ``. public struct Alphabet { + /// A type whose instances represent a character. public typealias Element = String @@ -28,8 +29,10 @@ public struct Alphabet { /// A marker denoting the end of a sequence. 
public let eos: Int32 + /// A marker denoting the end of a word. public let eow: Int32 + /// A marker used for padding inside a sequence. public let pad: Int32 diff --git a/Support/Text/WordSeg/CharacterSequence.swift b/Support/Text/WordSeg/CharacterSequence.swift index d4c4101afba..5dcc08a0147 100644 --- a/Support/Text/WordSeg/CharacterSequence.swift +++ b/Support/Text/WordSeg/CharacterSequence.swift @@ -16,8 +16,10 @@ import TensorFlow /// A sequence of characters represented by integers. public struct CharacterSequence: Hashable { + /// A collection of integers representing a sequence of characters. public let characters: [Int32] + /// A marker denoting the end of the sequence. private let eos: Int32 @@ -82,8 +84,10 @@ public struct CharacterSequence: Hashable { /// Count of characters in the sequence, including the end marker. public var count: Int { return characters.count } + /// The last character in the sequence, i.e. the end marker. public var last: Int32? { return characters.last } + /// TODO: what's happening here? public var tensor: Tensor { Tensor([self.eos] + characters[0.. Date: Tue, 2 Jun 2020 20:12:48 +0000 Subject: [PATCH 13/30] Clarify more summaries. Remove explicit parameter descriptions and add them to summaries. Conform parameter names to Swift conventions. 
--- Benchmarks/Models/WordSeg.swift | 10 +- Examples/WordSeg/main.swift | 16 +- Models/Text/WordSeg/Lattice.swift | 83 +++-------- Models/Text/WordSeg/Model.swift | 138 ++++++++---------- Models/Text/WordSeg/SemiRing.swift | 21 +-- Support/Text/WordSeg/Alphabet.swift | 16 +- Support/Text/WordSeg/CharacterSequence.swift | 33 ++--- Support/Text/WordSeg/Lexicon.swift | 25 +--- Tests/SupportTests/WordSegSupportTests.swift | 2 +- .../WordSegmentationTests/ProbeLayers.swift | 32 ++-- 10 files changed, 153 insertions(+), 223 deletions(-) diff --git a/Benchmarks/Models/WordSeg.swift b/Benchmarks/Models/WordSeg.swift index 74cf1280601..127fd918372 100644 --- a/Benchmarks/Models/WordSeg.swift +++ b/Benchmarks/Models/WordSeg.swift @@ -106,14 +106,14 @@ struct WordSegBenchmark: Benchmark { from: [sentence], alphabet: dataset.alphabet, maxLength: maximumSequenceLength, - minFreq: 10 + minFrequency: 10 ) let modelParameters = SNLM.Parameters( - ndim: 512, - dropoutProb: 0.5, - chrVocab: dataset.alphabet, - strVocab: lexicon, + hiddenSize: 512, + dropoutProbability: 0.5, + alphabet: dataset.alphabet, + lexicon: lexicon, order: 5 ) diff --git a/Examples/WordSeg/main.swift b/Examples/WordSeg/main.swift index a09db0ce51f..004ba4517c9 100644 --- a/Examples/WordSeg/main.swift +++ b/Examples/WordSeg/main.swift @@ -18,9 +18,9 @@ import TensorFlow import TextModels // Model flags -let ndim = 512 // Hidden unit size. +let hiddenSize = 512 // Hidden unit size. // Training flags -let dropoutProb = 0.5 // Dropout rate. +let dropoutProbability = 0.5 // Dropout rate. let order = 5 // Power of length penalty. let maxEpochs = 1000 // Maximum number of training epochs. var trainingLossHistory = [Float]() // Keep track of loss. @@ -30,7 +30,7 @@ let learningRate: Float = 1e-3 // Initial learning rate. let lambd: Float = 0.00075 // Weight of length penalty. // Lexicon flags. let maxLength = 10 // Maximum length of a string. -let minFreq = 10 // Minimum frequency of a string. 
+let minFrequency = 10 // Minimum frequency of a string. // Load user-provided data files. let dataset: WordSegDataset @@ -55,14 +55,14 @@ let lexicon = Lexicon( from: sequences, alphabet: dataset.alphabet, maxLength: maxLength, - minFreq: minFreq + minFrequency: minFrequency ) let modelParameters = SNLM.Parameters( - ndim: ndim, - dropoutProb: dropoutProb, - chrVocab: dataset.alphabet, - strVocab: lexicon, + hiddenSize: hiddenSize, + dropoutProbability: dropoutProbability, + alphabet: dataset.alphabet, + lexicon: lexicon, order: order ) diff --git a/Models/Text/WordSeg/Lattice.swift b/Models/Text/WordSeg/Lattice.swift index ddd5168883d..ccecfe53dc1 100644 --- a/Models/Text/WordSeg/Lattice.swift +++ b/Models/Text/WordSeg/Lattice.swift @@ -52,17 +52,11 @@ public struct Lattice: Differentiable { /// The expected total score for this segmentation. public var totalScore: SemiRing - /// Creates an edge for `sentence` between `start` and `end` node - /// positions. Sets the log probability to `logp` and uses this value to - /// calculate the score. Sums the score with `previous` to determine the - /// total score. + /// Creates an edge for `sentence` between `start` and `end`. /// - /// - Parameter start: the position of the start node. - /// - Parameter end: the position of the end node. - /// - Parameter sentence: the character sequence. - /// - Parameter logp: the log likelihood. - /// - Parameter previous: the total score of the preceding edge. - /// - Parameter order: the power of the length penalty. + /// Uses the log probability `logp` and the power of the length penalty + /// `order` to calculate the regularization factor and form the current + /// score. Sums this score with `previous` to determine the total score. 
@differentiable init( start: Int, end: Int, sentence: CharacterSequence, logp: Tensor, @@ -81,15 +75,8 @@ public struct Lattice: Differentiable { self.totalScore = self.score * previous } - /// Creates an edge for `string` between `start` and `end` node - /// positions. Sets the log probability, score, and total score. - /// - /// - Parameter start: the position of the start node. - /// - Parameter end: the position of the end node. - /// - Parameter string: the character sequence. - /// - Parameter logp: the log likelihood. - /// - Parameter score: the current score. - /// - Parameter totalScore: the total score. + /// Creates an edge for `string` between `start` and `end` and sets the + /// log probability `logp`, `score`, and `totalScore`. @differentiable public init( start: Int, end: Int, string: CharacterSequence, logp: Tensor, @@ -104,9 +91,10 @@ public struct Lattice: Differentiable { } } - /// Represents a word boundary. When a lattice is built, a start node is - /// created, followed by one for every character in the sequence, - /// representing every potential boundary. + /// Represents a word boundary. + /// + /// When a lattice is built, a start node is created, followed by one for + /// every character in the sequence, representing every potential boundary. /// /// - Note: Scores are only meaningful in relation to incoming edges and the /// start node has no incoming edges. @@ -127,13 +115,8 @@ public struct Lattice: Differentiable { /// Creates an empty instance. init() {} - /// Creates a node preceded by `bestEdge`. Stores `bestScore` and - /// `semiringScore`. Sets incoming edges to `edges`. - /// - /// - Parameter bestEdge: the best incoming edge. - /// - Parameter bestScore: the score of the best incoming edge. - /// - Parameter edges: the incoming edges. - /// - Parameter semiringScore: the composite score of all incoming edges. 
+ /// Creates a node preceded by `bestEdge`, sets incoming edges to + /// `edges`, and stores `bestScore` and `semiringScore`. @differentiable public init( bestEdge: Edge?, bestScore: Float, edges: [Edge], @@ -145,21 +128,21 @@ public struct Lattice: Differentiable { self.semiringScore = semiringScore } - /// Calculates the semiring score by summing the total score of all edges. + /// Returns a sum of the total score of all incoming edges. @differentiable func computeSemiringScore() -> SemiRing { // TODO: Reduceinto and += edges.differentiableMap { $0.totalScore }.sum() } - /// Calculates the current semiring score and sets `semiringScore`. + /// Calculates and sets the current semiring score. @differentiable mutating func recomputeSemiringScore() { semiringScore = computeSemiringScore() } } - /// An ordered collection of nodes. + /// Represents the position of word boundaries. var positions: [Node] /// Accesses the node at the `index`th position. @@ -177,24 +160,16 @@ public struct Lattice: Differentiable { } /// Creates an empty instance with a start node, followed by `count` nodes. - /// - /// - Parameter count: the length of the lattice, e.g. number of characters - /// in the sequence. init(count: Int) { positions = Array(repeating: Node(), count: count + 1) } /// Creates an instance with the nodes in `positions`. - /// - /// - Parameter positions: the nodes composing the lattice. public init(positions: [Node]) { self.positions = positions } - /// Returns a set of edges with the best total score. Traversing this path - /// produces a segmented version of `sentence`. - /// - /// - Parameter sentence: the text to be segmented. + /// Returns the path representing the best segmentation of `sentence`. public mutating func viterbi(sentence: CharacterSequence) -> [Edge] { // Forward pass // Starts at 1 since the 0 node has no incoming edges. 
@@ -224,11 +199,9 @@ public struct Lattice: Differentiable { return bestPath.reversed() } - /// Returns the plain text encoded in `path`, e.g. the segmentation of the - /// full character sequence. + /// Returns the plain text encoded in `path`, using `alphabet`. /// - /// - Parameter path: a lattice path. - /// - Parameter alphabet: the alphabet used in path creation. + /// This represents the segmentation of the full character sequence. public static func pathToPlainText(path: [Edge], alphabet: Alphabet) -> String { var plainText = [String]() for edge in path { @@ -283,11 +256,9 @@ extension Lattice.Edge: CustomStringConvertible { extension Lattice { /// Returns true when all nodes in `self` are within `tolerance` of all - /// nodes in `other`. This behavior is modeled after SE-0259. + /// nodes in `other`. /// - /// - Parameter other: the instance to be compared with `self`. - /// - Parameter tolerance: the amount of variability considered acceptable - /// in determining equality. + /// - Note: This behavior is modeled after SE-0259. public func isAlmostEqual(to other: Self, tolerance: Float) -> Bool { guard self.positions.count == other.positions.count else { print("positions count mismatch: \(self.positions.count) != \(other.positions.count)") @@ -308,12 +279,9 @@ extension Lattice { extension Lattice.Node { /// Returns true when all properties and edges in `self` are within - /// `tolerance` of all properties and edges in `other`. This behavior is - /// modeled after SE-0259. + /// `tolerance` of all properties and edges in `other`. /// - /// - Parameter other: the instance to be compared with `self`. - /// - Parameter tolerance: the amount of variability considered acceptable - /// in determining equality. + /// - Note: This behavior is modeled after SE-0259. 
public func isAlmostEqual(to other: Self, tolerance: Float) -> Bool { guard self.edges.count == other.edges.count else { return false } @@ -338,12 +306,9 @@ extension Lattice.Node { extension Lattice.Edge { /// Returns true when the log likelihood and scores in `self` are within - /// `tolerance` of the log likelihood and scores in `other`. This behavior - /// is modeled after SE-0259. + /// `tolerance` of the log likelihood and scores in `other`. /// - /// - Parameter other: the instance to be compared with `self`. - /// - Parameter tolerance: the amount of variability considered acceptable - /// in determining equality. + /// - Note: This behavior is modeled after SE-0259. public func isAlmostEqual(to other: Self, tolerance: Float) -> Bool { return self.start == other.start && self.end == other.end // TODO: figure out why the string equality is being ignored diff --git a/Models/Text/WordSeg/Model.swift b/Models/Text/WordSeg/Model.swift index b5c341f97f0..03e608be7cf 100644 --- a/Models/Text/WordSeg/Model.swift +++ b/Models/Text/WordSeg/Model.swift @@ -31,34 +31,33 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { public struct Parameters { /// The hidden unit size. - public var ndim: Int + public var hiddenSize: Int /// The dropout rate. - public var dropoutProb: Double + public var dropoutProbability: Double - /// The character vocabulary. - public var chrVocab: Alphabet + /// The union of characters used in this model. + public var alphabet: Alphabet - /// The string vocabulary. - public var strVocab: Lexicon + /// Contiguous sequences of characters encountered in the training data. + public var lexicon: Lexicon /// The power of the length penalty. public var order: Int - /// Creates an instance with `ndim` hidden units, `dropoutProb` dropout - /// rate, `chrVocab` alphabet, `strVocab` lexicon, and `order` power of - /// length penalty. 
+ /// Creates an instance with `hiddenSize` units, `dropoutProbability` + /// rate, `alphabet`, `lexicon`, and `order` power of length penalty. public init( - ndim: Int, - dropoutProb: Double, - chrVocab: Alphabet, - strVocab: Lexicon, + hiddenSize: Int, + dropoutProbability: Double, + alphabet: Alphabet, + lexicon: Lexicon, order: Int ) { - self.ndim = ndim - self.dropoutProb = dropoutProb - self.chrVocab = chrVocab - self.strVocab = strVocab + self.hiddenSize = hiddenSize + self.dropoutProbability = dropoutProbability + self.alphabet = alphabet + self.lexicon = lexicon self.order = order } } @@ -104,55 +103,51 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { // MARK: - Initializer /// Creates an instance with the configuration defined by `parameters`. - /// - /// - Parameter parameters: the model configuration. public init(parameters: Parameters) { self.parameters = parameters // Encoder self.encoderEmbedding = Embedding( - vocabularySize: parameters.chrVocab.count, - embeddingSize: parameters.ndim) + vocabularySize: parameters.alphabet.count, + embeddingSize: parameters.hiddenSize) self.encoderLSTM = LSTM( LSTMCell( - inputSize: parameters.ndim, + inputSize: parameters.hiddenSize, hiddenSize: - parameters.ndim)) + parameters.hiddenSize)) // Interpolation weight self.mlpInterpolation = MLP( - nIn: parameters.ndim, - nHidden: parameters.ndim, - nOut: 2, - dropoutProbability: parameters.dropoutProb) + inputSize: parameters.hiddenSize, + hiddenSize: parameters.hiddenSize, + outputSize: 2, + dropoutProbability: parameters.dropoutProbability) // Lexical memory self.mlpMemory = MLP( - nIn: parameters.ndim, - nHidden: parameters.ndim, - nOut: parameters.strVocab.count, - dropoutProbability: parameters.dropoutProb) + inputSize: parameters.hiddenSize, + hiddenSize: parameters.hiddenSize, + outputSize: parameters.lexicon.count, + dropoutProbability: parameters.dropoutProbability) // Character-level decoder self.decoderEmbedding = Embedding( - 
vocabularySize: parameters.chrVocab.count, - embeddingSize: parameters.ndim) + vocabularySize: parameters.alphabet.count, + embeddingSize: parameters.hiddenSize) self.decoderLSTM = LSTM( LSTMCell( - inputSize: parameters.ndim, + inputSize: parameters.hiddenSize, hiddenSize: - parameters.ndim)) - self.decoderDense = Dense(inputSize: parameters.ndim, outputSize: parameters.chrVocab.count) + parameters.hiddenSize)) + self.decoderDense = Dense(inputSize: parameters.hiddenSize, outputSize: parameters.alphabet.count) // Other layers - self.dropout = Dropout(probability: parameters.dropoutProb) + self.dropout = Dropout(probability: parameters.dropoutProbability) } // MARK: - Encode /// Returns the hidden states of the encoder LSTM applied to `x`. - /// - /// - Parameter x: the character sequence to encode. public func encode(_ x: CharacterSequence) -> [Tensor] { var embedded = encoderEmbedding(x.tensor) embedded = dropout(embedded) @@ -165,10 +160,8 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { // MARK: - Decode - /// Returns the log probabilities for each of the candidates. - /// - /// - Parameter candidates: the character sequences to decode. - /// - Parameter state: the hidden state from the encoder LSTM. + /// Returns the log probabilities for each sequence in `candidates`, given + /// hidden `state` from the encoder LSTM. public func decode(_ candidates: [CharacterSequence], _ state: Tensor) -> Tensor { // TODO(TF-433): Remove closure workaround when autodiff supports non-active rethrowing // functions (`Array.map`). 
@@ -176,16 +169,16 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { var xBatch: [Int32] = [] var yBatch: [Int32] = [] for candidate in candidates { - let padding = Array(repeating: parameters.chrVocab.pad, count: maxLen - candidate.count - 1) + let padding = Array(repeating: parameters.alphabet.pad, count: maxLen - candidate.count - 1) // x is {sentence}{padding} - xBatch.append(parameters.chrVocab.eow) + xBatch.append(parameters.alphabet.eow) xBatch.append(contentsOf: candidate.characters) xBatch.append(contentsOf: padding) // y is {sentence}{padding} yBatch.append(contentsOf: candidate.characters) - yBatch.append(parameters.chrVocab.eow) + yBatch.append(parameters.alphabet.eow) yBatch.append(contentsOf: padding) } @@ -194,26 +187,26 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { let x: Tensor = Tensor(shape: [candidates.count, maxLen], scalars: xBatch).transposed() let y: Tensor = Tensor(shape: [candidates.count, maxLen], scalars: yBatch).transposed() - // [time x batch x ndim] + // [time x batch x hiddenSize] var embeddedX = decoderEmbedding(x) embeddedX = dropout(embeddedX) - // [batch x ndim] + // [batch x hiddenSize] let stateBatch = state.rankLifted().tiled(multiples: Tensor([Int32(candidates.count), 1])) - // [time] array of LSTM states whose `hidden` and `cell` fields have shape [batch x ndim] + // [time] array of LSTM states whose `hidden` and `cell` fields have shape [batch x hiddenSize] let decoderStates = decoderLSTM( embeddedX.unstacked(), initialState: LSTMCell.State( cell: Tensor(zeros: stateBatch.shape), hidden: stateBatch)) - // [time x batch x ndim] + // [time x batch x hiddenSize] var decoderResult = Tensor( stacking: decoderStates.differentiableMap { $0.hidden }) decoderResult = dropout(decoderResult) - // [time x batch x chrVocab.count] + // [time x batch x alphabet.count] let logits = decoderDense(decoderResult) // [time x batch] @@ -226,7 +219,7 @@ public struct SNLM: EuclideanDifferentiable, 
KeyPathIterable { ).reshaped(to: y.shape) // [time x batch] - let logpExcludingPad = logp * Tensor(y .!= parameters.chrVocab.pad) + let logpExcludingPad = logp * Tensor(y .!= parameters.alphabet.pad) // [batch] let candidateLogP = logpExcludingPad.transposed().sum(squeezingAxes: 1) @@ -236,25 +229,16 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { // MARK: - buildLattice - /// Returns the log likelihood for `candidate` from the lexical memory + /// Returns the log probability for `candidate` from the lexical memory /// `logp_lex`. - /// - /// - Parameter logp_lex: all log likelihoods in the lexical memory. - /// - Parameter candidate: the character sequence for which to retrieve the - /// log likelihood. func get_logp_lex(_ logp_lex: Tensor, _ candidate: CharacterSequence) -> Tensor { - guard let index = parameters.strVocab.dictionary[candidate] else { + guard let index = parameters.lexicon.dictionary[candidate] else { return Tensor(-Float.infinity) } return logp_lex[Int(index)] } - /// Returns a complete lattice for `sentence` with a maximum length of - /// `maxLen`. - /// - /// - Parameter sentence: the character sequence used for determining - /// segmentation. - /// - Parameter maxLen: the maximum allowable sequence length. + /// Returns a lattice for `sentence` with `maxLen` maximum sequence length. @differentiable public func buildLattice(_ sentence: CharacterSequence, maxLen: Int) -> Lattice { var lattice = Lattice(count: sentence.count) @@ -267,12 +251,12 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { // TODO: avoid copies? 
let candidate = CharacterSequence( - alphabet: parameters.chrVocab, + alphabet: parameters.alphabet, characters: sentence[pos.."] continue } @@ -282,7 +266,7 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { let current_state = states[pos] let logg = logg_batch[pos].identityADHack // [2] - let logp_lex = logp_lex_batch[pos].identityADHack // [strVocab.chr.count] + let logp_lex = logp_lex_batch[pos].identityADHack // [lexicon.chr.count] let logp_chr = decode(candidates, current_state).identityADHack // [candidates.count] if pos != 0 { // Cleanup: lattice[pos].recomputeSemiringScore() @@ -322,8 +306,9 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { extension Array { - /// Sets the `index`th element of `self` to `value`. Semantically, it - /// behaves like `Array.subscript.set`. + /// Sets the `index`th element of `self` to `value`. + /// + /// Semantically, this function behaves like `Array.subscript.set`. /// /// - Note: this mutating method exists as a workaround for /// `Array.subscript._modify` not being differentiable (TF-1277). @@ -361,17 +346,12 @@ public struct MLP: Layer { /// The second dense layer. public var dense2: Dense - /// Creates an instance with input size `nIn`, `nHidden` hidden units, - /// dropout probability `dropoutProbability` and output size `nOut`. - /// - /// - Parameter nIn: input size. - /// - Parameter nHidden: number of hidden units. - /// - Parameter nOut: output size. - /// - Parameter dropoutProbability: probability that an input is dropped. - public init(nIn: Int, nHidden: Int, nOut: Int, dropoutProbability: Double) { - dense1 = Dense(inputSize: nIn, outputSize: nHidden, activation: tanh) + /// Creates an instance with `inputSize`, `hiddenSize`, + /// `dropoutProbability`, and `outputSize`. 
+ public init(inputSize: Int, hiddenSize: Int, outputSize: Int, dropoutProbability: Double) { + dense1 = Dense(inputSize: inputSize, outputSize: hiddenSize, activation: tanh) dropout = Dropout(probability: dropoutProbability) - dense2 = Dense(inputSize: nHidden, outputSize: nOut, activation: logSoftmax) + dense2 = Dense(inputSize: hiddenSize, outputSize: outputSize, activation: logSoftmax) } /// Returns the result of applying all three layers in sequence to `input`. diff --git a/Models/Text/WordSeg/SemiRing.swift b/Models/Text/WordSeg/SemiRing.swift index 0c7809b10c9..2eaa787eaac 100644 --- a/Models/Text/WordSeg/SemiRing.swift +++ b/Models/Text/WordSeg/SemiRing.swift @@ -23,7 +23,9 @@ import TensorFlow #endif /// Returns a single tensor containing the log of the sum of the exponentials -/// in `x`. Used for numerical stability when dealing with very small values. +/// in `x`. +/// +/// Used for numerical stability when dealing with very small values. @differentiable public func logSumExp(_ x: [Tensor]) -> Tensor { // Deal with an empty array first. @@ -32,8 +34,9 @@ public func logSumExp(_ x: [Tensor]) -> Tensor { } /// Returns a single tensor containing the log of the sum of the exponentials -/// in `lhs` and `rhs`. Used for numerical stability when dealing with very -/// small values. +/// in `lhs` and `rhs`. +/// +/// Used for numerical stability when dealing with very small values. @differentiable public func logSumExp(_ lhs: Tensor, _ rhs: Tensor) -> Tensor { return logSumExp([lhs, rhs]) @@ -71,8 +74,9 @@ public struct SemiRing: Differentiable { static var one: SemiRing { SemiRing(logp: 0.0, logr: -Float.infinity) } } -/// Multiplies `lhs` by `rhs`. Since scores are on a logarithmic scale, -/// products become sums. +/// Multiplies `lhs` by `rhs`. +/// +/// Since scores are on a logarithmic scale, products become sums. 
@differentiable func * (_ lhs: SemiRing, _ rhs: SemiRing) -> SemiRing { return SemiRing( @@ -109,12 +113,9 @@ extension SemiRing { extension SemiRing { - /// Returns true when `self` is within `tolerance` of `other`. This behavior - /// is modeled after SE-0259. + /// Returns true when `self` is within `tolerance` of `other`. /// - /// - Parameter other: the instance to be compared with `self`. - /// - Parameter tolerance: the amount of variability considered acceptable - /// in determining equality. + /// - Note: This behavior is modeled after SE-0259. // TODO(abdulras) see if we can use ulp as a default tolerance @inlinable public func isAlmostEqual(to other: Self, tolerance: Float) -> Bool { diff --git a/Support/Text/WordSeg/Alphabet.swift b/Support/Text/WordSeg/Alphabet.swift index ac5eb66aee4..96a5356914b 100644 --- a/Support/Text/WordSeg/Alphabet.swift +++ b/Support/Text/WordSeg/Alphabet.swift @@ -14,11 +14,13 @@ import TensorFlow -/// A collection that maps individual characters to an integer representation. +/// A mapping between individual characters and their integer representation. /// -/// In Python implementations, this is sometimes called the character vocabulary. +/// - Note: We map from String in order to support multi-character metadata +/// sequences such as ``. /// -/// - Note: We map from String in order to support multi-character metadata sequences such as ``. +/// - Note: In Python implementations, this is sometimes called the character +/// vocabulary. public struct Alphabet { /// A type whose instances represent a character. @@ -37,7 +39,7 @@ public struct Alphabet { public let pad: Int32 /// Creates an instance containing a mapping from `letters` to unique - /// integers, including markers. + /// integers, including markers `eos`, `eow`, and `pad`. 
public init(_ letters: C, eos: String, eow: String, pad: String) where C.Element == Character { self.dictionary = .init(zip(letters.lazy.map { String($0) }, 0...)) @@ -53,7 +55,7 @@ public struct Alphabet { } /// Creates an instance containing a mapping from `letters` to unique - /// integers, including markers. + /// integers, including markers `eos`, `eow`, and `pad`. public init(_ letters: C, eos: String, eow: String, pad: String) where C.Element == Element { self.dictionary = .init(zip(letters.lazy.map { String($0) }, 0...)) @@ -68,10 +70,10 @@ public struct Alphabet { self.dictionary[pad] = self.pad } - /// A count of the characters in the alphabet, including markers. + /// A count of unique characters, including markers. public var count: Int { return dictionary.count } - /// Accesses the `key`th element. + /// Accesses the `key`th element, returning `nil` if it does not exist. public subscript(key: String) -> Int32? { return dictionary[key] } diff --git a/Support/Text/WordSeg/CharacterSequence.swift b/Support/Text/WordSeg/CharacterSequence.swift index 5dcc08a0147..010518df476 100644 --- a/Support/Text/WordSeg/CharacterSequence.swift +++ b/Support/Text/WordSeg/CharacterSequence.swift @@ -17,26 +17,23 @@ import TensorFlow /// A sequence of characters represented by integers. public struct CharacterSequence: Hashable { - /// A collection of integers representing a sequence of characters. + /// Representing an ordered sequence of characters. public let characters: [Int32] /// A marker denoting the end of the sequence. private let eos: Int32 - /// Creates an instance without meaningful contents. + /// Creates an empty instance without meaningful contents. public init(_debug: Int) { self.characters = [] self.eos = -1 } - /// Creates a sequence from `string`, using the integers from `alphabet`, - /// appended with the end of sequence marker. + /// Creates a sequence from `string`, using `alphabet`, appended with the + /// end of sequence marker. 
/// - /// - Parameter alphabet: character to integer mapping. - /// - Parameter appendingEoSTo: string to be converted to a sequence of - /// integers. - /// - /// - Throws: An error of type 'CharacterErrors'. + /// - Throws: `CharacterErrors.unknownCharacter` if `string` contains a + /// character that does not exist in `alphabet`. public init(alphabet: Alphabet, appendingEoSTo string: String) throws { var characters = [Int32]() characters.reserveCapacity(string.count + 1) @@ -51,22 +48,14 @@ public struct CharacterSequence: Hashable { } /// Creates a sequence from `characters` and sets the end of sequence marker - /// from `alphabet`. - /// - /// - Parameter alphabet: character to integer mapping. - /// - Parameter characters: sequence of integers with a terminal end of - /// sequence marker. + /// from `alphabet`. private init(alphabet: Alphabet, characters: [Int32]) { self.characters = characters self.eos = alphabet.eos } - /// Creates a sequenxe from `characters` and sets the end of sequence marker + /// Creates a sequence from `characters` and sets the end of sequence marker /// from `alphabet`. - /// - /// - Parameter alphabet: character to integer mapping. - /// - Parameter characters: sequence of integers with a terminal end of - /// sequence marker. public init(alphabet: Alphabet, characters: ArraySlice) { self.characters = [Int32](characters) self.eos = alphabet.eos @@ -85,7 +74,9 @@ public struct CharacterSequence: Hashable { /// Count of characters in the sequence, including the end marker. public var count: Int { return characters.count } - /// The last character in the sequence, i.e. the end marker. + /// The last character in the sequence, if `characters` is not empty. + /// + /// - Note: This is usually the end marker. public var last: Int32? { return characters.last } /// TODO: what's happening here? 
@@ -96,7 +87,7 @@ public struct CharacterSequence: Hashable { extension CharacterSequence: CustomStringConvertible { - /// A string representation of the collection of integers representing the character sequence. + /// A string representation of the integers in the character sequence. public var description: String { "\(characters)" } diff --git a/Support/Text/WordSeg/Lexicon.swift b/Support/Text/WordSeg/Lexicon.swift index 9ca4f568a2d..428f6dbe9b6 100644 --- a/Support/Text/WordSeg/Lexicon.swift +++ b/Support/Text/WordSeg/Lexicon.swift @@ -14,7 +14,7 @@ import TensorFlow -/// A collection that maps character sequences to logical words. +/// Keeps track of logical words. /// /// In Python implementations, this is sometimes called the string vocabulary /// (in contrast to the character vocabulary or `Alphabet`, which maps @@ -24,35 +24,26 @@ public struct Lexicon { /// A type whose instances represent a sequence of characters. public typealias Element = CharacterSequence - /// A one-to-one mapping between a sequence of characters and unique - /// integers. + /// A one-to-one mapping between logical words and unique integers. // TODO(marcrasi): if the value is not used to construct Tensor, switch to Int public var dictionary: BijectiveDictionary /// A count of unique logical words in the lexicon. public var count: Int { return dictionary.count } - /// Creates an instance containing a mapping from `sequences` to unique - /// integers. - /// - /// - Parameter sequences: character sequences to compose the lexicon. + /// Creates an instance containing `sequences`. public init(_ sequences: C) where C.Element == Element { self.dictionary = .init(zip(sequences, 0...)) } - /// Creates an instance containing a mapping from `sequences` to unique - /// integers, using `alphabet`. Sequences are truncated at `maxLength` and - /// only those occurring `minFreq` times are included. - /// - /// - Parameter sequences: character sequences to compose the lexicon. 
- /// - Parameter alphabet: all characters contained in `sequences`. - /// - Parameter maxLength: sequence length at which truncation occurs. - /// - Parameter minFreq: minimum required occurrence of each sequence. + /// Creates an instance containing `sequences` using `alphabet`, truncating + /// elements at `maxLength` and including only those appearing at least + /// `minFrequency` times. public init( from sequences: [CharacterSequence], alphabet: Alphabet, maxLength: Int, - minFreq: Int + minFrequency: Int ) { var histogram: [ArraySlice: Int] = [:] @@ -69,7 +60,7 @@ public struct Lexicon { } } - let frequentWordCandidates = histogram.filter { $0.1 >= minFreq } + let frequentWordCandidates = histogram.filter { $0.1 >= minFrequency } let vocab = frequentWordCandidates.map { CharacterSequence(alphabet: alphabet, characters: $0.0) } diff --git a/Tests/SupportTests/WordSegSupportTests.swift b/Tests/SupportTests/WordSegSupportTests.swift index 78406e6a6f2..c97feda32da 100644 --- a/Tests/SupportTests/WordSegSupportTests.swift +++ b/Tests/SupportTests/WordSegSupportTests.swift @@ -72,7 +72,7 @@ class WordSegSupportTests: XCTestCase { try! CharacterSequence(alphabet: alphabet, appendingEoSTo: "alpha"), try! CharacterSequence(alphabet: alphabet, appendingEoSTo: "beta"), try! 
CharacterSequence(alphabet: alphabet, appendingEoSTo: "gamma"), - ], alphabet: alphabet, maxLength: 5, minFreq: 4) + ], alphabet: alphabet, maxLength: 5, minFrequency: 4) XCTAssertEqual(lexicon.count, 3) } diff --git a/Tests/TextTests/WordSegmentationTests/ProbeLayers.swift b/Tests/TextTests/WordSegmentationTests/ProbeLayers.swift index a9337bdb10c..815d8bd0c97 100644 --- a/Tests/TextTests/WordSegmentationTests/ProbeLayers.swift +++ b/Tests/TextTests/WordSegmentationTests/ProbeLayers.swift @@ -123,42 +123,42 @@ func almostEqual( class WordSegProbeLayerTests: XCTestCase { func testProbeEncoder() { - // chrVocab is: + // alphabet is: // 0 - a // 1 - b // 2 - // 3 - // 4 - - let chrVocab: Alphabet = Alphabet( + let alphabet: Alphabet = Alphabet( [ "a", "b", ], eos: "", eow: "", pad: "") - // strVocab is: + // lexicon is: // 0 - aaaa // 1 - bbbb // 2 - abab - let strVocab: Lexicon = Lexicon([ - CharacterSequence(alphabet: chrVocab, characters: [0, 0]), // "aa" - CharacterSequence(alphabet: chrVocab, characters: [1, 1]), // "bb" - CharacterSequence(alphabet: chrVocab, characters: [0, 1]), // "ab" - CharacterSequence(alphabet: chrVocab, characters: [1, 0]), // "ba" + let lexicon: Lexicon = Lexicon([ + CharacterSequence(alphabet: alphabet, characters: [0, 0]), // "aa" + CharacterSequence(alphabet: alphabet, characters: [1, 1]), // "bb" + CharacterSequence(alphabet: alphabet, characters: [0, 1]), // "ab" + CharacterSequence(alphabet: alphabet, characters: [1, 0]), // "ba" ]) var model = SNLM( parameters: SNLM.Parameters( - ndim: 2, - dropoutProb: 0, - chrVocab: chrVocab, - strVocab: strVocab, + hiddenSize: 2, + dropoutProbability: 0, + alphabet: alphabet, + lexicon: lexicon, order: 5)) model.setParameters(Example1.parameters) print("Encoding") let encoderStates = model.encode( - CharacterSequence(alphabet: chrVocab, characters: [0, 1, 0, 1])) // "abab" + CharacterSequence(alphabet: alphabet, characters: [0, 1, 0, 1])) // "abab" let encoderStatesTensor = Tensor(stacking: 
encoderStates) print("Expected: \(Example1.expectedEncoding)") print("Actual: \(encoderStatesTensor)") @@ -184,8 +184,8 @@ class WordSegProbeLayerTests: XCTestCase { print("Decode") let decoded = model.decode( [ - CharacterSequence(alphabet: chrVocab, characters: [0, 0, 0]), // "aaa" - CharacterSequence(alphabet: chrVocab, characters: [0, 1]), // "ab" + CharacterSequence(alphabet: alphabet, characters: [0, 0, 0]), // "aaa" + CharacterSequence(alphabet: alphabet, characters: [0, 1]), // "ab" ], encoderStates[0] ) @@ -195,7 +195,7 @@ class WordSegProbeLayerTests: XCTestCase { print("OK!\n") print("Build Lattice") - let abab = CharacterSequence(alphabet: chrVocab, characters: [0, 1, 0, 1]) + let abab = CharacterSequence(alphabet: alphabet, characters: [0, 1, 0, 1]) let lattice = model.buildLattice(abab, maxLen: 5) XCTAssert(lattice.isAlmostEqual(to: Example1.lattice, tolerance: 1e-5)) From 31fc09b3c9eda1513cd38c5ff9a989376014992c Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Tue, 2 Jun 2020 16:26:37 -0400 Subject: [PATCH 14/30] Lint --- Models/Text/WordSeg/Model.swift | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Models/Text/WordSeg/Model.swift b/Models/Text/WordSeg/Model.swift index 03e608be7cf..39af6fb43d7 100644 --- a/Models/Text/WordSeg/Model.swift +++ b/Models/Text/WordSeg/Model.swift @@ -139,7 +139,8 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { inputSize: parameters.hiddenSize, hiddenSize: parameters.hiddenSize)) - self.decoderDense = Dense(inputSize: parameters.hiddenSize, outputSize: parameters.alphabet.count) + self.decoderDense = Dense( + inputSize: parameters.hiddenSize, outputSize: parameters.alphabet.count) // Other layers self.dropout = Dropout(probability: parameters.dropoutProbability) From e80dfc2fd9ad0465f54de12c63f263ba5b3c34d2 Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Tue, 2 Jun 2020 20:34:30 +0000 Subject: [PATCH 15/30] Clarify lattice summary. 
--- Models/Text/WordSeg/Lattice.swift | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Models/Text/WordSeg/Lattice.swift b/Models/Text/WordSeg/Lattice.swift index ccecfe53dc1..5ede8495508 100644 --- a/Models/Text/WordSeg/Lattice.swift +++ b/Models/Text/WordSeg/Lattice.swift @@ -24,8 +24,9 @@ import TensorFlow #endif /// A structure used for scoring all possible segmentations of a character -/// sequence. The path with the best score provides the most likely -/// segmentation at inference. +/// sequence. +/// +/// The path with the best score provides the most likely segmentation. public struct Lattice: Differentiable { /// Represents a word. From 09e82b5e18f1c0a3e4391246721770fb501e1f7c Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Tue, 2 Jun 2020 21:10:32 +0000 Subject: [PATCH 16/30] Summary refinement --- Support/Text/WordSeg/CharacterSequence.swift | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Support/Text/WordSeg/CharacterSequence.swift b/Support/Text/WordSeg/CharacterSequence.swift index 010518df476..3f35773686e 100644 --- a/Support/Text/WordSeg/CharacterSequence.swift +++ b/Support/Text/WordSeg/CharacterSequence.swift @@ -17,7 +17,7 @@ import TensorFlow /// A sequence of characters represented by integers. public struct CharacterSequence: Hashable { - /// Representing an ordered sequence of characters. + /// Represents an ordered sequence of characters. public let characters: [Int32] /// A marker denoting the end of the sequence. @@ -30,7 +30,7 @@ public struct CharacterSequence: Hashable { } /// Creates a sequence from `string`, using `alphabet`, appended with the - /// end of sequence marker. + /// end marker. /// /// - Throws: `CharacterErrors.unknownCharacter` if `string` contains a /// character that does not exist in `alphabet`. 
@@ -47,15 +47,15 @@ public struct CharacterSequence: Hashable { self.init(alphabet: alphabet, characters: characters) } - /// Creates a sequence from `characters` and sets the end of sequence marker - /// from `alphabet`. + /// Creates a sequence from `characters` and sets the end marker from + /// `alphabet`. private init(alphabet: Alphabet, characters: [Int32]) { self.characters = characters self.eos = alphabet.eos } - /// Creates a sequence from `characters` and sets the end of sequence marker - /// from `alphabet`. + /// Creates a sequence from `characters` and sets the end marker from + /// `alphabet`. public init(alphabet: Alphabet, characters: ArraySlice) { self.characters = [Int32](characters) self.eos = alphabet.eos From d94a45e6b82a1e28c436e9d546640ab6aaca6bf2 Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Fri, 5 Jun 2020 01:37:04 +0000 Subject: [PATCH 17/30] Clarify end marker behavior and assumptions --- Support/Text/WordSeg/CharacterSequence.swift | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Support/Text/WordSeg/CharacterSequence.swift b/Support/Text/WordSeg/CharacterSequence.swift index 3f35773686e..30def797169 100644 --- a/Support/Text/WordSeg/CharacterSequence.swift +++ b/Support/Text/WordSeg/CharacterSequence.swift @@ -49,6 +49,8 @@ public struct CharacterSequence: Hashable { /// Creates a sequence from `characters` and sets the end marker from /// `alphabet`. + /// + /// - Note: Assumes `characters` contains an end marker. private init(alphabet: Alphabet, characters: [Int32]) { self.characters = characters self.eos = alphabet.eos @@ -56,6 +58,8 @@ public struct CharacterSequence: Hashable { /// Creates a sequence from `characters` and sets the end marker from /// `alphabet`. + /// + /// - Note: Assumes `characters` contains an end marker. 
public init(alphabet: Alphabet, characters: ArraySlice) { self.characters = [Int32](characters) self.eos = alphabet.eos @@ -79,7 +83,8 @@ public struct CharacterSequence: Hashable { /// - Note: This is usually the end marker. public var last: Int32? { return characters.last } - /// TODO: what's happening here? + /// Representation for character generation, with the end marker moved to + /// the beginning. public var tensor: Tensor { Tensor([self.eos] + characters[0.. Date: Tue, 9 Jun 2020 15:50:18 +0000 Subject: [PATCH 18/30] Rename ReferenceArchive to DownloadableArchive Change members to lets --- Datasets/WordSeg/WordSegDataset.swift | 32 +++++++++++++-------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift index f45a3374007..5c108134ceb 100644 --- a/Datasets/WordSeg/WordSegDataset.swift +++ b/Datasets/WordSeg/WordSegDataset.swift @@ -36,19 +36,19 @@ public struct WordSegDataset { public let alphabet: Alphabet /// A pointer to source data. - private struct ReferenceArchive { + private struct DownloadableArchive { /// The location of the archive. - var location = URL(string: "https://s3.eu-west-2.amazonaws.com/k-kawakami/seg.zip")! + let location = URL(string: "https://s3.eu-west-2.amazonaws.com/k-kawakami/seg.zip")! /// The path to the test source. - var testingFilePath = "br/br-text/te.txt" + let testingFilePath = "br/br-text/te.txt" /// The path to the training source. - var trainingFilePath = "br/br-text/tr.txt" + let trainingFilePath = "br/br-text/tr.txt" /// The path to the validation source. - var validationFilePath = "br/br-text/va.txt" + let validationFilePath = "br/br-text/va.txt" } /// Returns the text of all phrases parsed from `data` in UTF8. @@ -131,28 +131,28 @@ public struct WordSegDataset { /// - Throws: an error in the Cocoa domain, if the default training file /// cannot be read. 
public init() throws { - let referenceArchive = ReferenceArchive() + let downloadableArchive = DownloadableArchive() let localStorageDirectory: URL = DatasetUtilities.defaultDirectory .appendingPathComponent("WordSeg", isDirectory: true) WordSegDataset.downloadIfNotPresent( - to: localStorageDirectory, referenceArchive: referenceArchive) + to: localStorageDirectory, downloadableArchive: downloadableArchive) let archiveFileName = - referenceArchive + downloadableArchive .location.deletingPathExtension().lastPathComponent let archiveDirectory = localStorageDirectory .appendingPathComponent(archiveFileName) let trainingFilePath = archiveDirectory - .appendingPathComponent(referenceArchive.trainingFilePath).path + .appendingPathComponent(downloadableArchive.trainingFilePath).path let validationFilePath = archiveDirectory - .appendingPathComponent(referenceArchive.validationFilePath).path + .appendingPathComponent(downloadableArchive.validationFilePath).path let testingFilePath = archiveDirectory - .appendingPathComponent(referenceArchive.testingFilePath).path + .appendingPathComponent(downloadableArchive.testingFilePath).path try self.init( training: trainingFilePath, validation: validationFilePath, @@ -226,10 +226,10 @@ public struct WordSegDataset { self.testingPhrases = Self.convertDataset(testing, alphabet: self.alphabet) } - /// Downloads and unpacks `referenceArchive` to `directory` if it does not + /// Downloads and unpacks `downloadableArchive` to `directory` if it does not /// exist locally. 
private static func downloadIfNotPresent( - to directory: URL, referenceArchive: ReferenceArchive + to directory: URL, downloadableArchive: DownloadableArchive ) { let downloadPath = directory.path let directoryExists = FileManager.default.fileExists(atPath: downloadPath) @@ -238,9 +238,9 @@ public struct WordSegDataset { guard !directoryExists || directoryEmpty else { return } - let remoteRoot = referenceArchive.location.deletingLastPathComponent() - let filename = referenceArchive.location.deletingPathExtension().lastPathComponent - let fileExtension = referenceArchive.location.pathExtension + let remoteRoot = downloadableArchive.location.deletingLastPathComponent() + let filename = downloadableArchive.location.deletingPathExtension().lastPathComponent + let fileExtension = downloadableArchive.location.pathExtension // Downloads and extracts dataset files. let _ = DatasetUtilities.downloadResource( From c9ffce653ac1571bfdb073a27c0bcb63114bb93f Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Tue, 9 Jun 2020 08:52:05 -0700 Subject: [PATCH 19/30] Update Datasets/WordSeg/WordSegDataset.swift Co-authored-by: Dave Abrahams --- Datasets/WordSeg/WordSegDataset.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift index 5c108134ceb..29bdfb6d446 100644 --- a/Datasets/WordSeg/WordSegDataset.swift +++ b/Datasets/WordSeg/WordSegDataset.swift @@ -41,7 +41,7 @@ public struct WordSegDataset { /// The location of the archive. let location = URL(string: "https://s3.eu-west-2.amazonaws.com/k-kawakami/seg.zip")! - /// The path to the test source. + /// The path to the test source within the unpacked archive. let testingFilePath = "br/br-text/te.txt" /// The path to the training source. 
From 80e575a72a1d464bbc28b4c157d43aa3b674f805 Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Tue, 9 Jun 2020 08:52:17 -0700 Subject: [PATCH 20/30] Update Datasets/WordSeg/WordSegDataset.swift Co-authored-by: Dave Abrahams --- Datasets/WordSeg/WordSegDataset.swift | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift index 29bdfb6d446..43456160f84 100644 --- a/Datasets/WordSeg/WordSegDataset.swift +++ b/Datasets/WordSeg/WordSegDataset.swift @@ -38,7 +38,8 @@ public struct WordSegDataset { /// A pointer to source data. private struct DownloadableArchive { - /// The location of the archive. + /// A [web resource](https://en.wikipedia.org/wiki/Web_resource) that can be unpacked + /// into data files described by other properties of `self`. let location = URL(string: "https://s3.eu-west-2.amazonaws.com/k-kawakami/seg.zip")! /// The path to the test source within the unpacked archive. From fb5e5d73c3b89d9493486d300beddd9c1740d2e4 Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Tue, 9 Jun 2020 16:38:42 +0000 Subject: [PATCH 21/30] Remove implied text from comments with phrase. --- Datasets/WordSeg/WordSegDataset.swift | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift index 43456160f84..f2ddc367f2b 100644 --- a/Datasets/WordSeg/WordSegDataset.swift +++ b/Datasets/WordSeg/WordSegDataset.swift @@ -23,13 +23,13 @@ import ModelSupport /// https://www.aclweb.org/anthology/P19-1645.pdf. public struct WordSegDataset { - /// The text used for training. + /// The training data. public let trainingPhrases: [Phrase] - /// The text used for testing. + /// The test data. public private(set) var testingPhrases: [Phrase]? - /// The text used for validation. + /// The validation data. public private(set) var validationPhrases: [Phrase]? 
/// The union of all characters in the included dataset. @@ -42,17 +42,17 @@ public struct WordSegDataset { /// into data files described by other properties of `self`. let location = URL(string: "https://s3.eu-west-2.amazonaws.com/k-kawakami/seg.zip")! - /// The path to the test source within the unpacked archive. + /// The path to the test data within the unpacked archive. let testingFilePath = "br/br-text/te.txt" - /// The path to the training source. + /// The path to the training data within the unpacked archive. let trainingFilePath = "br/br-text/tr.txt" - /// The path to the validation source. + /// The path to the validation data within the unpacked archive. let validationFilePath = "br/br-text/va.txt" } - /// Returns the text of all phrases parsed from `data` in UTF8. + /// Returns all phrases parsed from `data` in UTF8. private static func load(data: Data) -> [String] { guard let contents: String = String(data: data, encoding: .utf8) else { return [] @@ -60,7 +60,7 @@ public struct WordSegDataset { return load(contents: contents) } - /// Returns the text of all phrases from `contents`. + /// Returns all phrases from `contents`. private static func load(contents: String) -> [String] { var strings = [String]() @@ -127,7 +127,7 @@ public struct WordSegDataset { return phrases } - /// Creates an instance containing phrases from the default location. + /// Creates an instance containing phrases from the reference archive. /// /// - Throws: an error in the Cocoa domain, if the default training file /// cannot be read. 
From 92b58e943d3be86db528b2e5d71f5036e1466e4c Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Tue, 9 Jun 2020 18:59:15 +0000 Subject: [PATCH 22/30] Remove Foundation string processing Remove unnecessary additional `load()` Remove unnecessary optional from `testingPhrases` and `validationPhrases` Simplify optional filename handling in init() Remove extra `)` from training loss output Add test for loading only training file --- Datasets/WordSeg/WordSegDataset.swift | 54 ++++++------------- Examples/WordSeg/main.swift | 8 +-- .../WordSeg/WordSegDatasetTests.swift | 31 ++++++++--- 3 files changed, 44 insertions(+), 49 deletions(-) diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift index f2ddc367f2b..979c5d64de8 100644 --- a/Datasets/WordSeg/WordSegDataset.swift +++ b/Datasets/WordSeg/WordSegDataset.swift @@ -27,10 +27,10 @@ public struct WordSegDataset { public let trainingPhrases: [Phrase] /// The test data. - public private(set) var testingPhrases: [Phrase]? + public private(set) var testingPhrases: [Phrase] /// The validation data. - public private(set) var validationPhrases: [Phrase]? + public private(set) var validationPhrases: [Phrase] /// The union of all characters in the included dataset. public let alphabet: Alphabet @@ -52,24 +52,11 @@ public struct WordSegDataset { let validationFilePath = "br/br-text/va.txt" } - /// Returns all phrases parsed from `data` in UTF8. + /// Returns phrases parsed from `data` in UTF8, separated by newlines. private static func load(data: Data) -> [String] { - guard let contents: String = String(data: data, encoding: .utf8) else { - return [] - } - return load(contents: contents) - } - - /// Returns all phrases from `contents`. 
- private static func load(contents: String) -> [String] { - var strings = [String]() - - for line in contents.components(separatedBy: .newlines) { - let trimmed = line.trimmingCharacters(in: .whitespaces) - if trimmed.isEmpty { continue } - strings.append(trimmed) - } - return strings + let contents = String(decoding: data, as: Unicode.UTF8.self) + let splitContents = contents.split(separator: "\n", omittingEmptySubsequences: true) + return splitContents.map { String($0) } } /// Returns the union of all characters in `training` and `otherSequences`. @@ -113,7 +100,7 @@ public struct WordSegDataset { var phrases = [Phrase]() for data in dataset { - let trimmed = data.components(separatedBy: .whitespaces).joined() + let trimmed = data.split(separator: " ", omittingEmptySubsequences: true).joined() guard let numericalizedText = try? CharacterSequence( alphabet: alphabet, appendingEoSTo: trimmed) @@ -175,26 +162,15 @@ public struct WordSegDataset { options: .alwaysMapped) let training = Self.load(data: trainingData) - let validation: [String] - let testing: [String] - - if let validationFile = validationFile { - let data = try Data( - contentsOf: URL(fileURLWithPath: validationFile), - options: .alwaysMapped) - validation = Self.load(data: data) - } else { - validation = [String]() - } + let validationData = try Data( + contentsOf: URL(fileURLWithPath: validationFile ?? "/dev/null"), + options: .alwaysMapped) + let validation = Self.load(data: validationData) - if let testingFile = testingFile { - let data: Data = try Data( - contentsOf: URL(fileURLWithPath: testingFile), - options: .alwaysMapped) - testing = Self.load(data: data) - } else { - testing = [String]() - } + let testingData = try Data( + contentsOf: URL(fileURLWithPath: testingFile ?? 
"/dev/null"), + options: .alwaysMapped) + let testing = Self.load(data: testingData) self.alphabet = Self.makeAlphabet(datasets: training, validation, testing) self.trainingPhrases = Self.convertDataset(training, alphabet: self.alphabet) diff --git a/Examples/WordSeg/main.swift b/Examples/WordSeg/main.swift index c2b850c0589..10909cf744f 100644 --- a/Examples/WordSeg/main.swift +++ b/Examples/WordSeg/main.swift @@ -103,11 +103,11 @@ for epoch in 1...maxEpochs { trainingLossHistory.append(trainingLoss) reduceLROnPlateau(lossHistory: trainingLossHistory, optimizer: optimizer) - guard let validationPhrases = dataset.validationPhrases else { + if dataset.validationPhrases.count < 1 { print( """ [Epoch \(epoch)] \ - Training loss: \(trainingLoss)) + Training loss: \(trainingLoss) """ ) @@ -127,7 +127,7 @@ for epoch in 1...maxEpochs { var validationBatchCount = 0 var validationCharacterCount = 0 var validationPlainText: String = "" - for phrase in validationPhrases { + for phrase in dataset.validationPhrases { let sentence = phrase.numericalizedText var lattice = model.buildLattice(sentence, maxLen: maxLength) let score = lattice[sentence.count].semiringScore @@ -137,7 +137,7 @@ for epoch in 1...maxEpochs { validationCharacterCount += sentence.count // View a sample segmentation once per epoch. 
- if validationBatchCount == validationPhrases.count { + if validationBatchCount == dataset.validationPhrases.count { let bestPath = lattice.viterbi(sentence: phrase.numericalizedText) validationPlainText = Lattice.pathToPlainText(path: bestPath, alphabet: dataset.alphabet) } diff --git a/Tests/DatasetsTests/WordSeg/WordSegDatasetTests.swift b/Tests/DatasetsTests/WordSeg/WordSegDatasetTests.swift index 06f073dd542..15cd2e6c75d 100644 --- a/Tests/DatasetsTests/WordSeg/WordSegDatasetTests.swift +++ b/Tests/DatasetsTests/WordSeg/WordSegDatasetTests.swift @@ -17,12 +17,12 @@ import ModelSupport import XCTest class WordSegDatasetTests: XCTestCase { - func testCreateWordSegDataset() { + func testCreateWordSegDatasetReference() { do { let dataset = try WordSegDataset() XCTAssertEqual(dataset.trainingPhrases.count, 7832) - XCTAssertEqual(dataset.validationPhrases!.count, 979) - XCTAssertEqual(dataset.testingPhrases!.count, 979) + XCTAssertEqual(dataset.validationPhrases.count, 979) + XCTAssertEqual(dataset.testingPhrases.count, 979) // Check the first example in each set. 
let trainingExample: [Int32] = [ @@ -32,14 +32,32 @@ class WordSegDatasetTests: XCTestCase { XCTAssertEqual(dataset.trainingPhrases[0].numericalizedText.characters, trainingExample) let validationExample: [Int32] = [9, 6, 13, 13, 16, 14, 10, 14, 10, 28] - XCTAssertEqual(dataset.validationPhrases![0].numericalizedText.characters, validationExample) + XCTAssertEqual(dataset.validationPhrases[0].numericalizedText.characters, validationExample) let testingExample: [Int32] = [ 13, 6, 21, 14, 6, 20, 6, 6, 10, 7, 10, 4, 2, 15, 20, 6, 6, 2, 15, 26, 3, 16, 5, 26, 10, 15, 21, 9, 2, 21, 14, 10, 19, 19, 16, 19, 28, ] - XCTAssertEqual(dataset.testingPhrases![0].numericalizedText.characters, testingExample) + XCTAssertEqual(dataset.testingPhrases[0].numericalizedText.characters, testingExample) + } catch { + XCTFail(error.localizedDescription) + } + } + + func testCreateWordSegDatasetTrainingOnly() { + do { + let dataset = try WordSegDataset(training: "/home/michellecasbon/tmp/seg/br/br-text/tr.txt") + XCTAssertEqual(dataset.trainingPhrases.count, 7832) + XCTAssertEqual(dataset.validationPhrases.count, 0) + XCTAssertEqual(dataset.testingPhrases.count, 0) + + // Check the first example in each set. 
+ let trainingExample: [Int32] = [ + 26, 16, 22, 24, 2, 15, 21, 21, 16, 20, 6, 6, 21, + 9, 6, 3, 16, 16, 12, 28, + ] + XCTAssertEqual(dataset.trainingPhrases[0].numericalizedText.characters, trainingExample) } catch { XCTFail(error.localizedDescription) } @@ -66,7 +84,8 @@ class WordSegDatasetTests: XCTestCase { } static var allTests = [ - ("testCreateWordSegDataset", testCreateWordSegDataset), + ("testCreateWordSegDatasetReference", testCreateWordSegDatasetReference), + ("testCreateWordSegDatasetTrainingOnly", testCreateWordSegDatasetTrainingOnly), ("testWordSegDatasetLoad", testWordSegDatasetLoad), ] } From 21fc998097ab3299a42845c8269ad4e7148a3001 Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Tue, 9 Jun 2020 12:07:45 -0700 Subject: [PATCH 23/30] Update Datasets/WordSeg/WordSegDataset.swift Co-authored-by: Dave Abrahams --- Datasets/WordSeg/WordSegDataset.swift | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift index 979c5d64de8..c17074d127d 100644 --- a/Datasets/WordSeg/WordSegDataset.swift +++ b/Datasets/WordSeg/WordSegDataset.swift @@ -61,9 +61,9 @@ public struct WordSegDataset { /// Returns the union of all characters in `training` and `otherSequences`. /// - /// - Parameter eos: text to be used as the end of sequence marker. - /// - Parameter eow: text to be used as the end of word marker. - /// - Parameter pad: text to be used as the padding marker. + /// - Parameter eos: the end of sequence marker. + /// - Parameter eow:the end of word marker. + /// - Parameter pad: the padding marker. 
  private static func makeAlphabet(
    datasets training: [String],
    _ otherSequences: [String]?...,

From cc3f30ea7755ec3697be928c3e5e781c8d79aeaf Mon Sep 17 00:00:00 2001
From: Michelle Casbon
Date: Wed, 10 Jun 2020 16:50:01 +0000
Subject: [PATCH 24/30] Remove variadic arguments in makeAlphabet

Simplify and remove redundant init code
---
 Datasets/WordSeg/WordSegDataset.swift | 39 +++++++--------------------
 1 file changed, 10 insertions(+), 29 deletions(-)

diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift
index c17074d127d..1f402aa00a9 100644
--- a/Datasets/WordSeg/WordSegDataset.swift
+++ b/Datasets/WordSeg/WordSegDataset.swift
@@ -59,26 +59,22 @@ public struct WordSegDataset {
     return splitContents.map { String($0) }
   }
 
-  /// Returns the union of all characters in `training` and `otherSequences`.
+  /// Returns the union of all characters in `phrases`.
   ///
   /// - Parameter eos: the end of sequence marker.
   /// - Parameter eow: the end of word marker.
   /// - Parameter pad: the padding marker.
   private static func makeAlphabet(
-    datasets training: [String],
-    _ otherSequences: [String]?...,
+    phrases: [String],
     eos: String = "</s>",
     eow: String = "</w>",
     pad: String = "</pad>"
   ) -> Alphabet {
    var letters: Set<Character> = []
 
-    for dataset in otherSequences + [training] {
-      guard let dataset = dataset else { continue }
-      for sentence in dataset {
-        for character in sentence {
-          if !character.isWhitespace { letters.insert(character) }
-        }
+    for phrase in phrases {
+      for character in phrase {
+        if !character.isWhitespace { letters.insert(character) }
       }
     }
 
@@ -160,22 +156,17 @@ public struct WordSegDataset {
     let trainingData = try Data(
       contentsOf: URL(fileURLWithPath: trainingFile), options: .alwaysMapped)
-    let training = Self.load(data: trainingData)
 
     let validationData = try Data(
       contentsOf: URL(fileURLWithPath: validationFile ??
"/dev/null"), options: .alwaysMapped) - let validation = Self.load(data: validationData) let testingData = try Data( contentsOf: URL(fileURLWithPath: testingFile ?? "/dev/null"), options: .alwaysMapped) - let testing = Self.load(data: testingData) - self.alphabet = Self.makeAlphabet(datasets: training, validation, testing) - self.trainingPhrases = Self.convertDataset(training, alphabet: self.alphabet) - self.validationPhrases = Self.convertDataset(validation, alphabet: self.alphabet) - self.testingPhrases = Self.convertDataset(testing, alphabet: self.alphabet) + self.init( + training: trainingData, validation: validationData, testing: testingData) } /// Creates an instance containing phrases from `trainingData`, and @@ -184,20 +175,10 @@ public struct WordSegDataset { training trainingData: Data, validation validationData: Data?, testing testingData: Data? ) { let training = Self.load(data: trainingData) - let validation: [String] - let testing: [String] - if let validationData = validationData { - validation = Self.load(data: validationData) - } else { - validation = [String]() - } - if let testingData = testingData { - testing = Self.load(data: testingData) - } else { - testing = [String]() - } + let validation = Self.load(data: validationData ?? Data()) + let testing = Self.load(data: testingData ?? 
Data()) - self.alphabet = Self.makeAlphabet(datasets: training, validation, testing) + self.alphabet = Self.makeAlphabet(phrases: training + validation + testing) self.trainingPhrases = Self.convertDataset(training, alphabet: self.alphabet) self.validationPhrases = Self.convertDataset(validation, alphabet: self.alphabet) self.testingPhrases = Self.convertDataset(testing, alphabet: self.alphabet) From 361e3805a2745887f24df77d775ad3793bb4520b Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Thu, 11 Jun 2020 15:47:44 +0000 Subject: [PATCH 25/30] Rename convertDataset to numericalizeDataset --- Datasets/WordSeg/WordSegDataset.swift | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift index 1f402aa00a9..686e61db7a1 100644 --- a/Datasets/WordSeg/WordSegDataset.swift +++ b/Datasets/WordSeg/WordSegDataset.swift @@ -85,12 +85,11 @@ public struct WordSegDataset { return Alphabet(sorted, eos: eos, eow: eow, pad: pad) } - /// Returns phrases from `dataset`, using `alphabet`, to be used with the + /// Numericalizes `dataset` with the mapping in `alphabet`, to be used with the /// WordSeg model. /// - /// - Note: Omits any part of the dataset that cannot be converted to - /// `CharacterSequence`. - private static func convertDataset(_ dataset: [String], alphabet: Alphabet) + /// - Note: Omits any phrase that cannot be converted to `CharacterSequence`. + private static func numericalizeDataset(_ dataset: [String], alphabet: Alphabet) -> [Phrase] { var phrases = [Phrase]() @@ -179,9 +178,9 @@ public struct WordSegDataset { let testing = Self.load(data: testingData ?? 
Data())
 
     self.alphabet = Self.makeAlphabet(phrases: training + validation + testing)
-    self.trainingPhrases = Self.convertDataset(training, alphabet: self.alphabet)
-    self.validationPhrases = Self.convertDataset(validation, alphabet: self.alphabet)
-    self.testingPhrases = Self.convertDataset(testing, alphabet: self.alphabet)
+    self.trainingPhrases = Self.numericalizeDataset(training, alphabet: self.alphabet)
+    self.validationPhrases = Self.numericalizeDataset(validation, alphabet: self.alphabet)
+    self.testingPhrases = Self.numericalizeDataset(testing, alphabet: self.alphabet)
   }

From 6e0ae1543f9bce9e5afad232567d6417832fd626 Mon Sep 17 00:00:00 2001
From: Michelle Casbon
Date: Thu, 11 Jun 2020 17:30:25 +0000
Subject: [PATCH 26/30] Remove raw loop in makeAlphabet

Rename downloadableArchive to source
Preserve intermediate array type
---
 Datasets/WordSeg/WordSegDataset.swift | 45 +++++++++++----------------
 1 file changed, 18 insertions(+), 27 deletions(-)

diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift
index 686e61db7a1..1795f5262c1 100644
--- a/Datasets/WordSeg/WordSegDataset.swift
+++ b/Datasets/WordSeg/WordSegDataset.swift
@@ -53,10 +53,10 @@ public struct WordSegDataset {
   }
 
   /// Returns phrases parsed from `data` in UTF8, separated by newlines.
-  private static func load(data: Data) -> [String] {
+  private static func load(data: Data) -> [Substring] {
     let contents = String(decoding: data, as: Unicode.UTF8.self)
     let splitContents = contents.split(separator: "\n", omittingEmptySubsequences: true)
-    return splitContents.map { String($0) }
+    return splitContents
   }
 
   /// Returns the union of all characters in `phrases`.
@@ -65,22 +65,15 @@
   /// - Parameter eos: the end of sequence marker.
   /// - Parameter eow: the end of word marker.
   /// - Parameter pad: the padding marker.
  private static func makeAlphabet(
-    phrases: [String],
+    phrases: [Substring],
     eos: String = "</s>",
     eow: String = "</w>",
     pad: String = "</pad>"
   ) -> Alphabet {
-    var letters: Set<Character> = []
-
-    for phrase in phrases {
-      for character in phrase {
-        if !character.isWhitespace { letters.insert(character) }
-      }
-    }
+    let letters = Set(phrases.joined().lazy.filter { !$0.isWhitespace })
 
     // Sort the letters to make it easier to interpret ints vs letters.
-    var sorted = Array(letters)
-    sorted.sort()
+    let sorted = Array(letters).sorted()
 
     return Alphabet(sorted, eos: eos, eow: eow, pad: pad)
   }
 
@@ -89,7 +82,7 @@ public struct WordSegDataset {
   /// WordSeg model.
   ///
   /// - Note: Omits any phrase that cannot be converted to `CharacterSequence`.
-  private static func numericalizeDataset(_ dataset: [String], alphabet: Alphabet)
+  private static func numericalizeDataset(_ dataset: [Substring], alphabet: Alphabet)
     -> [Phrase]
   {
     var phrases = [Phrase]()
@@ -101,7 +94,7 @@
         alphabet: alphabet, appendingEoSTo: trimmed)
       else { continue }
       let phrase = Phrase(
-        plainText: data,
+        plainText: String(data),
         numericalizedText: numericalizedText)
       phrases.append(phrase)
     }
@@ -114,28 +107,26 @@
   /// - Throws: an error in the Cocoa domain, if the default training file
   /// cannot be read.
public init() throws { - let downloadableArchive = DownloadableArchive() + let source = DownloadableArchive() let localStorageDirectory: URL = DatasetUtilities.defaultDirectory .appendingPathComponent("WordSeg", isDirectory: true) WordSegDataset.downloadIfNotPresent( - to: localStorageDirectory, downloadableArchive: downloadableArchive) + to: localStorageDirectory, source: source) - let archiveFileName = - downloadableArchive - .location.deletingPathExtension().lastPathComponent + let archiveFileName = source.location.deletingPathExtension().lastPathComponent let archiveDirectory = localStorageDirectory .appendingPathComponent(archiveFileName) let trainingFilePath = archiveDirectory - .appendingPathComponent(downloadableArchive.trainingFilePath).path + .appendingPathComponent(source.trainingFilePath).path let validationFilePath = archiveDirectory - .appendingPathComponent(downloadableArchive.validationFilePath).path + .appendingPathComponent(source.validationFilePath).path let testingFilePath = archiveDirectory - .appendingPathComponent(downloadableArchive.testingFilePath).path + .appendingPathComponent(source.testingFilePath).path try self.init( training: trainingFilePath, validation: validationFilePath, @@ -183,10 +174,10 @@ public struct WordSegDataset { self.testingPhrases = Self.numericalizeDataset(testing, alphabet: self.alphabet) } - /// Downloads and unpacks `downloadableArchive` to `directory` if it does not + /// Downloads and unpacks `source` to `directory` if it does not /// exist locally. 
private static func downloadIfNotPresent( - to directory: URL, downloadableArchive: DownloadableArchive + to directory: URL, source: DownloadableArchive ) { let downloadPath = directory.path let directoryExists = FileManager.default.fileExists(atPath: downloadPath) @@ -195,9 +186,9 @@ public struct WordSegDataset { guard !directoryExists || directoryEmpty else { return } - let remoteRoot = downloadableArchive.location.deletingLastPathComponent() - let filename = downloadableArchive.location.deletingPathExtension().lastPathComponent - let fileExtension = downloadableArchive.location.pathExtension + let remoteRoot = source.location.deletingLastPathComponent() + let filename = source.location.deletingPathExtension().lastPathComponent + let fileExtension = source.location.pathExtension // Downloads and extracts dataset files. let _ = DatasetUtilities.downloadResource( From e94247796f71c4af4d080d8a4b17d408ce7c63e3 Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Thu, 11 Jun 2020 10:34:05 -0700 Subject: [PATCH 27/30] Update Datasets/WordSeg/WordSegDataset.swift Co-authored-by: Dave Abrahams --- Datasets/WordSeg/WordSegDataset.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift index 1795f5262c1..20b9aa0bf39 100644 --- a/Datasets/WordSeg/WordSegDataset.swift +++ b/Datasets/WordSeg/WordSegDataset.swift @@ -32,7 +32,7 @@ public struct WordSegDataset { /// The validation data. public private(set) var validationPhrases: [Phrase] - /// The union of all characters in the included dataset. + /// A mapping between characters used in the dataset and densly-packed integers public let alphabet: Alphabet /// A pointer to source data. 
From 6e0ae1543f9bce9e5afad232567d6417832fd626 Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Thu, 11 Jun 2020 17:35:07 +0000 Subject: [PATCH 28/30] s/densly/densely/ --- Datasets/WordSeg/WordSegDataset.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift index 20b9aa0bf39..e4bcdb380bf 100644 --- a/Datasets/WordSeg/WordSegDataset.swift +++ b/Datasets/WordSeg/WordSegDataset.swift @@ -32,7 +32,7 @@ public struct WordSegDataset { /// The validation data. public private(set) var validationPhrases: [Phrase] - /// A mapping between characters used in the dataset and densly-packed integers + /// A mapping between characters used in the dataset and densely-packed integers public let alphabet: Alphabet /// A pointer to source data. From ac6436fa012229f40920422147728007974c3aa9 Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Thu, 11 Jun 2020 18:05:25 +0000 Subject: [PATCH 29/30] Remove hard-coded path --- Tests/DatasetsTests/WordSeg/WordSegDatasetTests.swift | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Tests/DatasetsTests/WordSeg/WordSegDatasetTests.swift b/Tests/DatasetsTests/WordSeg/WordSegDatasetTests.swift index 15cd2e6c75d..09ca58e6545 100644 --- a/Tests/DatasetsTests/WordSeg/WordSegDatasetTests.swift +++ b/Tests/DatasetsTests/WordSeg/WordSegDatasetTests.swift @@ -47,7 +47,10 @@ class WordSegDatasetTests: XCTestCase { func testCreateWordSegDatasetTrainingOnly() { do { - let dataset = try WordSegDataset(training: "/home/michellecasbon/tmp/seg/br/br-text/tr.txt") + let localStorageDirectory: URL = DatasetUtilities.defaultDirectory + .appendingPathComponent("WordSeg", isDirectory: true) + let trainingFile = localStorageDirectory.appendingPathComponent("/seg/br/br-text/tr.txt") + let dataset = try WordSegDataset(training: trainingFile.path) XCTAssertEqual(dataset.trainingPhrases.count, 7832) XCTAssertEqual(dataset.validationPhrases.count, 0) 
XCTAssertEqual(dataset.testingPhrases.count, 0) From 070d745d3f0b0453387e3a7b04baedefac5201f6 Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Fri, 12 Jun 2020 22:27:57 +0000 Subject: [PATCH 30/30] Replace `WordSegDataset` with `Self` --- Datasets/WordSeg/WordSegDataset.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift index e4bcdb380bf..8bb166d7aa6 100644 --- a/Datasets/WordSeg/WordSegDataset.swift +++ b/Datasets/WordSeg/WordSegDataset.swift @@ -111,7 +111,7 @@ public struct WordSegDataset { let localStorageDirectory: URL = DatasetUtilities.defaultDirectory .appendingPathComponent("WordSeg", isDirectory: true) - WordSegDataset.downloadIfNotPresent( + Self.downloadIfNotPresent( to: localStorageDirectory, source: source) let archiveFileName = source.location.deletingPathExtension().lastPathComponent