From 3f7a6f011823c78dca6da47f81e4a6e7a14d80cb Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Wed, 27 May 2020 01:11:52 +0000 Subject: [PATCH 01/30] Add documentation --- Support/Text/WordSeg/Alphabet.swift | 19 ++++++-- Support/Text/WordSeg/CharacterSequence.swift | 49 +++++++++++++++++++- Support/Text/WordSeg/Lexicon.swift | 40 ++++++++-------- 3 files changed, 84 insertions(+), 24 deletions(-) diff --git a/Support/Text/WordSeg/Alphabet.swift b/Support/Text/WordSeg/Alphabet.swift index 52d6672ef7e..273513436df 100644 --- a/Support/Text/WordSeg/Alphabet.swift +++ b/Support/Text/WordSeg/Alphabet.swift @@ -14,20 +14,29 @@ import TensorFlow -/// Alphabet maps from characters in a string to Int32 representations. -/// -/// Note: we map from String in order to support multi-character metadata sequences such as . +/// A collection that maps individual characters to an integer representation. /// /// In Python implementations, this is sometimes called the character vocabulary. +/// +/// - Note: We map from String in order to support multi-character metadata sequences such as ``. public struct Alphabet { + /// A type whose instances represent a character. public typealias Element = String + /// A one-to-one mapping between a set of characters and a unique integer. public var dictionary: BijectiveDictionary + /// A marker denoting the end of a sequence. public let eos: Int32 + + /// A marker denoting the end of a word. public let eow: Int32 + + /// A marker used for padding inside a sequence. public let pad: Int32 + /// Creates an instance containing a mapping from `letters` to unique + /// integers, including markers. public init(_ letters: C, eos: String, eow: String, pad: String) where C.Element == Character { self.dictionary = .init(zip(letters.lazy.map { String($0) }, 0...)) @@ -42,6 +51,8 @@ public struct Alphabet { self.dictionary[pad] = self.pad } + /// Creates an instance containing a mapping from `letters` to unique + /// integers, including markers. 
public init(_ letters: C, eos: String, eow: String, pad: String) where C.Element == Element { self.dictionary = .init(zip(letters.lazy.map { String($0) }, 0...)) @@ -56,8 +67,10 @@ public struct Alphabet { self.dictionary[pad] = self.pad } + /// A count of the characters in the alphabet, including markers. public var count: Int { return dictionary.count } + /// Accesses the `key`th element. public subscript(key: String) -> Int32? { return dictionary[key] } diff --git a/Support/Text/WordSeg/CharacterSequence.swift b/Support/Text/WordSeg/CharacterSequence.swift index 0a39c0b31c5..d5190d28c71 100644 --- a/Support/Text/WordSeg/CharacterSequence.swift +++ b/Support/Text/WordSeg/CharacterSequence.swift @@ -14,16 +14,25 @@ import TensorFlow -/// An Int32-based representation of a string to be used with the WordSeg model. +/// A sequence of characters represented by integers. public struct CharacterSequence: Hashable { + /// A collection of integers representing a sequence of characters. public let characters: [Int32] + /// A marker denoting the end of the sequence. private let eos: Int32 + /// Creates an instance without meaningful contents. public init(_debug: Int) { self.characters = [] self.eos = -1 } + /// Creates a sequence from `string`, using the integers from `alphabet`, + /// appended with the end of sequence marker. + /// + /// - Parameter alphabet: character to integer mapping. + /// - Parameter appendingEoSTo: string to be converted to a sequence of + /// integers. public init(alphabet: Alphabet, appendingEoSTo string: String) throws { var characters = [Int32]() characters.reserveCapacity(string.count + 1) @@ -37,33 +46,71 @@ public struct CharacterSequence: Hashable { self.init(alphabet: alphabet, characters: characters) } + /// Creates a sequence from `characters` and sets the end of sequence marker + /// from `alphabet`. + /// + /// - Parameter alphabet: character to integer mapping. 
+ /// - Parameter characters: sequence of integers with a terminal end of + /// sequence marker. private init(alphabet: Alphabet, characters: [Int32]) { self.characters = characters self.eos = alphabet.eos } + /// Creates a sequence from `characters` and sets the end of sequence marker + /// from `alphabet`. + /// + /// - Parameter alphabet: character to integer mapping. + /// - Parameter characters: sequence of integers with a terminal end of + /// sequence marker. public init(alphabet: Alphabet, characters: ArraySlice) { self.characters = [Int32](characters) self.eos = alphabet.eos } + /// Accesses the `index`th character. public subscript(index: Int32) -> Int32 { return characters[Int(index)] } + /// Accesses characters within `range`. public subscript(range: Range) -> ArraySlice { return characters[range] } + /// Count of characters in the sequence, including the end marker. public var count: Int { return characters.count } + /// The last character in the sequence, i.e. the end marker. public var last: Int32? { return characters.last } + /// TODO: what's happening here? public var tensor: Tensor { Tensor([self.eos] + characters[0.. + /// A count of unique logical words in the lexicon. public var count: Int { return dictionary.count } + /// Creates an instance containing a mapping from `sequences` to unique + /// integers. + /// + /// - Parameter sequences: character sequences to compose the lexicon. public init(_ sequences: C) where C.Element == Element { self.dictionary = .init(zip(sequences, 0...)) } + /// Creates an instance containing a mapping from `sequences` to unique + /// integers, using `alphabet`. Sequences are truncated at `maxLength` and + /// only those occurring `minFreq` times are included. + /// + /// - Parameter sequences: character sequences to compose the lexicon. + /// - Parameter alphabet: all characters contained in `sequences`. + /// - Parameter maxLength: sequence length at which truncation occurs. 
+ /// - Parameter minFreq: minimum required occurrence of each sequence. public init( from sequences: [CharacterSequence], alphabet: Alphabet, @@ -59,20 +76,3 @@ public struct Lexicon { self.init(vocab) } } - -public enum CharacterErrors: Error { - case unknownCharacter(character: Character, index: Int, sentence: String) - case nonUtf8Data -} - -extension CharacterErrors: CustomStringConvertible { - public var description: String { - switch self { - case let .unknownCharacter(character, index, sentence): - return - "Unknown character '\(character)' encountered at index \(index) while converting sentence \"\(sentence)\" to a character sequence." - case .nonUtf8Data: - return "Non-UTF8 data encountered." - } - } -} From 2d42404a122eaff45547f406664ebdaaa440488c Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Tue, 26 May 2020 21:19:36 -0400 Subject: [PATCH 02/30] Lint --- Support/Text/WordSeg/CharacterSequence.swift | 1 - 1 file changed, 1 deletion(-) diff --git a/Support/Text/WordSeg/CharacterSequence.swift b/Support/Text/WordSeg/CharacterSequence.swift index d5190d28c71..2eb4f6714bb 100644 --- a/Support/Text/WordSeg/CharacterSequence.swift +++ b/Support/Text/WordSeg/CharacterSequence.swift @@ -95,7 +95,6 @@ extension CharacterSequence: CustomStringConvertible { } } - /// An error that can be encountered when processing characters. 
public enum CharacterErrors: Error { case unknownCharacter(character: Character, index: Int, sentence: String) From bdf71a9fbba4ac0fa1a5f0c7c9179f9da5f2f1be Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Wed, 27 May 2020 02:07:54 +0000 Subject: [PATCH 03/30] Add dataset files --- Datasets/WordSeg/WordSegDataset.swift | 55 +++++++++++++++++++++++++++ Datasets/WordSeg/WordSegRecord.swift | 8 ++++ 2 files changed, 63 insertions(+) diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift index 260188173fe..93476f74488 100644 --- a/Datasets/WordSeg/WordSegDataset.swift +++ b/Datasets/WordSeg/WordSegDataset.swift @@ -15,21 +15,37 @@ import Foundation import ModelSupport +/// A collection of raw and processed text used for training and validation +/// of word segmentation models. public struct WordSegDataset { + /// A collection of text used for training. public let training: [WordSegRecord] + /// A collection of text used for testing. public private(set) var testing: [WordSegRecord]? + /// A collection of text used for validation. public private(set) var validation: [WordSegRecord]? + /// The set of characters found in all included texts. public let alphabet: Alphabet + /// Details used for downloading source data. private struct DownloadDetails { + /// The location of the archive. var archiveLocation = URL(string: "https://s3.eu-west-2.amazonaws.com/k-kawakami")! + /// The basename of the archive. var archiveFileName = "seg" + /// The extension of the archive. var archiveExtension = "zip" + /// The path to the test source. var testingFilePath = "br/br-text/te.txt" + /// The path to the training source. var trainingFilePath = "br/br-text/tr.txt" + /// The path to the validation source. var validationFilePath = "br/br-text/va.txt" } + /// Returns a list of records parsed from `data` in UTF8. + /// + /// - Parameter data: text in UTF8 format. 
private static func load(data: Data) throws -> [String] { guard let contents: String = String(data: data, encoding: .utf8) else { throw CharacterErrors.nonUtf8Data @@ -37,6 +53,10 @@ public struct WordSegDataset { return load(contents: contents) } + /// Separates `contents` into a collection of strings by newlines, trimming + /// leading and trailing whitespace and excluding blank lines. + /// + /// - Parameter contents: text to be separated by newline. private static func load(contents: String) -> [String] { var strings = [String]() @@ -48,6 +68,15 @@ public struct WordSegDataset { return strings } + /// Returns an alphabet composed of all characters found in `training` and + /// `otherSequences`. + /// + /// - Parameter training: full text of the training data. + /// - Parameter otherSequences: optional full text of the validation and + /// test data. + /// - Parameter eos: text to be used as the end of sequence marker. + /// - Parameter eow: text to be used as the end of word marker. + /// - Parameter pad: text to be used as the padding marker. private static func makeAlphabet( datasets training: [String], _ otherSequences: [String]?..., @@ -73,6 +102,10 @@ public struct WordSegDataset { return Alphabet(sorted, eos: eos, eow: eow, pad: pad) } + /// Creates a collection of records to be used with the WordSeg model. + /// + /// - Parameter dataset: text to be converted. + /// - Parameter alphabet: set of all characters used in `dataset`. private static func convertDataset(_ dataset: [String], alphabet: Alphabet) throws -> [WordSegRecord] { @@ -84,6 +117,12 @@ public struct WordSegDataset { alphabet: alphabet, appendingEoSTo: trimmed)) } } + + /// Returns a collection of records to be used with the WordSeg model, or + /// `nil` if `dataset` is empty. + /// + /// - Parameter dataset: text to be converted. + /// - Parameter alphabet: set of all characters used in `dataset`. 
private static func convertDataset(_ dataset: [String]?, alphabet: Alphabet) throws -> [WordSegRecord]? { @@ -94,6 +133,8 @@ public struct WordSegDataset { return nil } + /// Creates an instance containing `WordSegRecords` from the default + /// location. public init() throws { let downloadDetails = DownloadDetails() let localStorageDirectory: URL = FileManager.default.temporaryDirectory @@ -119,6 +160,11 @@ public struct WordSegDataset { testing: testingFilePath) } + /// Creates an instance containing `WordSegRecords` from the given files. + /// + /// - Parameter training: path to the file containing training data. + /// - Parameter validation: path to the file containing validation data. + /// - Parameter testing: path to the file containing test data. public init( training trainingFile: String, validation validationFile: String? = nil, @@ -151,6 +197,11 @@ public struct WordSegDataset { self.testing = try Self.convertDataset(testing, alphabet: self.alphabet) } + /// Creates an instance containing `WordSegRecords` from the given data. + /// + /// - Parameter training: contents of the training data. + /// - Parameter validation: contents of the validation data. + /// - Parameter testing: contents of the test data. public init( training trainingData: Data, validation validationData: Data?, testing testingData: Data? ) @@ -172,6 +223,10 @@ public struct WordSegDataset { self.testing = try Self.convertDataset(testing, alphabet: self.alphabet) } + /// Downloads and unpacks the source archive if it does not exist locally. + /// + /// - Parameter directory: local directory to store files. + /// - Parameter downloadDetails: where to find the source archive. 
private static func downloadIfNotPresent( to directory: URL, downloadDetails: DownloadDetails ) { diff --git a/Datasets/WordSeg/WordSegRecord.swift b/Datasets/WordSeg/WordSegRecord.swift index d0049a2a2c2..8445477a1eb 100644 --- a/Datasets/WordSeg/WordSegRecord.swift +++ b/Datasets/WordSeg/WordSegRecord.swift @@ -14,10 +14,18 @@ import ModelSupport +/// A sequence of text for use in word segmentation. public struct WordSegRecord { + /// A raw, unprocessed sequence of text. public let plainText: String + /// A sequence of text in numeric form, derived from `plainText`. public let numericalizedText: CharacterSequence + /// Creates an instance containing both raw and processed forms of a + /// sequence of text. + /// + /// - Parameter plainText: raw, unprocessed text. + /// - Parameter numericalizedText: processed text in numeric form. public init(plainText: String, numericalizedText: CharacterSequence) { self.plainText = plainText self.numericalizedText = numericalizedText From d44b5bf3177273a119fac0a5391d5a91cce85c95 Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Thu, 28 May 2020 16:56:52 +0000 Subject: [PATCH 04/30] Add lattice --- Models/Text/WordSeg/Lattice.swift | 107 +++++++++++++++++++++++++--- Support/Text/WordSeg/Alphabet.swift | 2 - 2 files changed, 96 insertions(+), 13 deletions(-) diff --git a/Models/Text/WordSeg/Lattice.swift b/Models/Text/WordSeg/Lattice.swift index 3a2babdc25d..cda03ba5218 100644 --- a/Models/Text/WordSeg/Lattice.swift +++ b/Models/Text/WordSeg/Lattice.swift @@ -23,23 +23,40 @@ import TensorFlow import Glibc #endif -/// Lattice -/// -/// Represents the lattice used by the WordSeg algorithm. +/// A structure used for scoring all possible segmentations of a character +/// sequence. The path with the best score provides the most likely +/// segmentation at inference. public struct Lattice: Differentiable { - /// Edge + /// Represents a word. 
/// - /// Represents an Edge + /// At each character position, an edge is constructed for every possible + /// segmentation of the preceding portion of the sequence. public struct Edge: Differentiable { + /// The node position immediately preceding this edge. @noDerivative public var start: Int + /// The node position immediately following this edge. @noDerivative public var end: Int + /// The characters composing a word. @noDerivative public var string: CharacterSequence + /// The log likelihood of this segmentation. public var logp: Tensor - // expectation + /// The expected score for this segmentation. public var score: SemiRing + /// The expected total score for this segmentation. public var totalScore: SemiRing + /// Creates an edge for `sentence` between `start` and `end` node + /// positions. Sets the log probability to `logp` and uses this value to + /// calculate the score. Sums the score with `previous` to determine the + /// total score. + /// + /// - Parameter start: the position of the start node. + /// - Parameter end: the position of the end node. + /// - Parameter sentence: the character sequence. + /// - Parameter logp: the log likelihood. + /// - Parameter previous: the total score of the preceding edge. + /// - Parameter order: the power of the length penalty. @differentiable init( start: Int, end: Int, sentence: CharacterSequence, logp: Tensor, @@ -58,6 +75,15 @@ public struct Lattice: Differentiable { self.totalScore = self.score * previous } + /// Creates an edge for `string` between `start` and `end` node + /// positions. Sets the log probability, score, and total score. + /// + /// - Parameter start: the position of the start node. + /// - Parameter end: the position of the end node. + /// - Parameter string: the character sequence. + /// - Parameter logp: the log likelihood. + /// - Parameter score: the current score. + /// - Parameter totalScore: the total score. 
@differentiable public init( start: Int, end: Int, string: CharacterSequence, logp: Tensor, @@ -72,17 +98,32 @@ public struct Lattice: Differentiable { } } - /// Node + /// Represents a word boundary. When a lattice is built, a start node is + /// created, followed by one for every character in the sequence, + /// representing every potential boundary. /// - /// Represents a node in the lattice + /// - Note: Scores are only meaningful in relation to incoming edges and the + /// start node has no incoming edges. public struct Node: Differentiable { + /// The incoming edge with the highest score. @noDerivative public var bestEdge: Edge? + /// The score of the best incoming edge. public var bestScore: Float = 0.0 + /// All incoming edges. public var edges = [Edge]() + /// A composite score of all incoming edges. public var semiringScore: SemiRing = SemiRing.one + /// Creates an empty instance. init() {} + /// Creates a node preceded by `bestEdge`. Stores `bestScore` and + /// `semiringScore`. Sets incoming edges to `edges`. + /// + /// - Parameter bestEdge: the best incoming edge. + /// - Parameter bestScore: the score of the best incoming edge. + /// - Parameter edges: the incoming edges. + /// - Parameter semiringScore: the composite score of all incoming edges. @differentiable public init( bestEdge: Edge?, bestScore: Float, edges: [Edge], @@ -94,20 +135,24 @@ public struct Lattice: Differentiable { self.semiringScore = semiringScore } + /// Calculates the semiring score by summing the total score of all edges. @differentiable func computeSemiringScore() -> SemiRing { // TODO: Reduceinto and += edges.differentiableMap { $0.totalScore }.sum() } + /// Calculates the current semiring score and sets `semiringScore`. @differentiable mutating func recomputeSemiringScore() { semiringScore = computeSemiringScore() } } + /// An ordered collection of nodes. var positions: [Node] + /// Accesses the node at the `index`th position. 
@differentiable public subscript(index: Int) -> Node { get { return positions[index] } @@ -121,16 +166,28 @@ public struct Lattice: Differentiable { // _modify { yield &positions[index] } } + /// Creates an empty instance with a start node, followed by `count` nodes. + /// + /// - Parameter count: the length of the lattice, e.g. number of characters + /// in the sequence. init(count: Int) { positions = Array(repeating: Node(), count: count + 1) } + /// Creates an instance with the nodes in `positions`. + /// + /// - Parameter positions: the nodes composing the lattice. public init(positions: [Node]) { self.positions = positions } + /// Returns a set of edges with the best total score. Traversing this path + /// produces a segmented version of `sentence`. + /// + /// - Parameter sentence: the text to be segmented. public mutating func viterbi(sentence: CharacterSequence) -> [Edge] { - // Forwards pass + // Forward pass + // Starts at 1 since the 0 node has no incoming edges. for position in 1...sentence.count { var bestScore = -Float.infinity var bestEdge: Edge! @@ -145,7 +202,7 @@ public struct Lattice: Differentiable { self[position].bestEdge = bestEdge } - // Backwards + // Backward pass var bestPath: [Edge] = [] var nextEdge = self[sentence.count].bestEdge! while nextEdge.start != 0 { @@ -157,6 +214,11 @@ public struct Lattice: Differentiable { return bestPath.reversed() } + /// Returns the plain text encoded in `path`, e.g. the segmentation of the + /// full character sequence. + /// + /// - Parameter path: a lattice path. + /// - Parameter alphabet: the alphabet used in path creation. public static func pathToPlainText(path: [Edge], alphabet: Alphabet) -> String { var plainText = [String]() for edge in path { @@ -171,6 +233,7 @@ public struct Lattice: Differentiable { } extension Lattice: CustomStringConvertible { + /// The plain text description of this instance that describes all nodes. 
public var description: String { """ [ @@ -181,6 +244,8 @@ extension Lattice: CustomStringConvertible { } extension Lattice.Node: CustomStringConvertible { + /// The plain text description of this instance that describes all incoming + /// edges. public var description: String { var edgesStr: String if edges.isEmpty { @@ -196,13 +261,19 @@ extension Lattice.Node: CustomStringConvertible { } extension Lattice.Edge: CustomStringConvertible { + /// The plain text description of this instance with all edge details. public var description: String { "[\(start)->\(end)] logp: \(logp), score: \(score.shortDescription), total score: \(totalScore.shortDescription), sentence: \(string)" } } -/// SE-0259-esque equality with tolerance extension Lattice { + /// Returns true when all nodes in `self` are within `tolerance` of all + /// nodes in `other`. This behavior is modeled after SE-0259. + /// + /// - Parameter other: the instance to be compared with `self`. + /// - Parameter tolerance: the amount of variability considered acceptable + /// in determining equality. public func isAlmostEqual(to other: Self, tolerance: Float) -> Bool { guard self.positions.count == other.positions.count else { print("positions count mismatch: \(self.positions.count) != \(other.positions.count)") @@ -221,6 +292,13 @@ extension Lattice { } extension Lattice.Node { + /// Returns true when all properties and edges in `self` are within + /// `tolerance` of all properties and edges in `other`. This behavior is + /// modeled after SE-0259. + /// + /// - Parameter other: the instance to be compared with `self`. + /// - Parameter tolerance: the amount of variability considered acceptable + /// in determining equality. 
public func isAlmostEqual(to other: Self, tolerance: Float) -> Bool { guard self.edges.count == other.edges.count else { return false } @@ -243,6 +321,13 @@ extension Lattice.Node { } extension Lattice.Edge { + /// Returns true when the log likelihood and scores in `self` are within + /// `tolerance` of the log likelihood and scores in `other`. This behavior + /// is modeled after SE-0259. + /// + /// - Parameter other: the instance to be compared with `self`. + /// - Parameter tolerance: the amount of variability considered acceptable + /// in determining equality. public func isAlmostEqual(to other: Self, tolerance: Float) -> Bool { return self.start == other.start && self.end == other.end // TODO: figure out why the string equality is being ignored diff --git a/Support/Text/WordSeg/Alphabet.swift b/Support/Text/WordSeg/Alphabet.swift index 273513436df..06add8d981f 100644 --- a/Support/Text/WordSeg/Alphabet.swift +++ b/Support/Text/WordSeg/Alphabet.swift @@ -28,10 +28,8 @@ public struct Alphabet { /// A marker denoting the end of a sequence. public let eos: Int32 - /// A marker denoting the end of a word. public let eow: Int32 - /// A marker used for padding inside a sequence. public let pad: Int32 From cf86c8cfdc17e8cacbfcd024dce6bc3226ebcca7 Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Thu, 28 May 2020 20:00:26 +0000 Subject: [PATCH 05/30] Add semiring --- Models/Text/WordSeg/SemiRing.swift | 35 +++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/Models/Text/WordSeg/SemiRing.swift b/Models/Text/WordSeg/SemiRing.swift index 6357c607557..c6854670036 100644 --- a/Models/Text/WordSeg/SemiRing.swift +++ b/Models/Text/WordSeg/SemiRing.swift @@ -22,9 +22,8 @@ import TensorFlow import Glibc #endif -/// logSumExp(_:) -/// -/// logSumExp (see https://en.wikipedia.org/wiki/LogSumExp) +/// Returns a single tensor containing the log of the sum of the exponentials +/// in `x`. 
Used for numerical stability when dealing with very small values. @differentiable public func logSumExp(_ x: [Tensor]) -> Tensor { // Deal with an empty array first. @@ -32,37 +31,45 @@ public func logSumExp(_ x: [Tensor]) -> Tensor { return Tensor(stacking: x).logSumExp() } -/// logSumExp(_:_:) -/// -/// Specialized logSumExp for 2 tensor of floats. +/// Returns a single tensor containing the log of the sum of the exponentials +/// in `lhs` and `rhs`. Used for numerical stability when dealing with very +/// small values. @differentiable public func logSumExp(_ lhs: Tensor, _ rhs: Tensor) -> Tensor { return logSumExp([lhs, rhs]) } -/// SemiRing -/// -/// Represents a SemiRing +/// A storage mechanism for scoring inside a lattice. public struct SemiRing: Differentiable { + /// The log likelihood. public var logp: Tensor + /// The regularization factor. public var logr: Tensor + /// Creates an instance with log likelihood `logp` and regularization + /// factor `logr`. @differentiable public init(logp: Tensor, logr: Tensor) { self.logp = logp self.logr = logr } + /// Creates an instance with log likelihood `logp` and regularization + /// factor `logr`. @differentiable public init(logp: Float, logr: Float) { self.logp = Tensor(logp) self.logr = Tensor(logr) } + /// The baseline score of zero. static var zero: SemiRing { SemiRing(logp: -Float.infinity, logr: -Float.infinity) } + /// The baseline score of one. static var one: SemiRing { SemiRing(logp: 0.0, logr: -Float.infinity) } } +/// Multiplies `lhs` by `rhs`. Since scores are on a logarithmic scale, +/// products become sums. @differentiable func * (_ lhs: SemiRing, _ rhs: SemiRing) -> SemiRing { return SemiRing( @@ -70,6 +77,7 @@ func * (_ lhs: SemiRing, _ rhs: SemiRing) -> SemiRing { logr: logSumExp(lhs.logp + rhs.logr, rhs.logp + lhs.logr)) } +/// Sums `lhs` by `rhs`. 
@differentiable func + (_ lhs: SemiRing, _ rhs: SemiRing) -> SemiRing { return SemiRing( @@ -78,6 +86,7 @@ func + (_ lhs: SemiRing, _ rhs: SemiRing) -> SemiRing { } extension Array where Element == SemiRing { + /// Returns a sum of all scores in the collection. @differentiable func sum() -> SemiRing { return SemiRing( @@ -87,13 +96,19 @@ extension Array where Element == SemiRing { } extension SemiRing { + /// The plain text description of this instance with score details. var shortDescription: String { "(\(logp), \(logr))" } } -/// SE-0259-esque equality with tolerance extension SemiRing { + /// Returns true when `self` is within `tolerance` of `other`. This behavior + /// is modeled after SE-0259. + /// + /// - Parameter other: the instance to be compared with `self`. + /// - Parameter tolerance: the amount of variability considered acceptable + /// in determining equality. // TODO(abdulras) see if we can use ulp as a default tolerance @inlinable public func isAlmostEqual(to other: Self, tolerance: Float) -> Bool { From 8c3812edda50e5ae113bfcc96baf892a5d496994 Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Thu, 28 May 2020 22:33:51 +0000 Subject: [PATCH 06/30] Add SNLM --- Models/Text/WordSeg/Model.swift | 84 +++++++++++++++++++++++++++------ 1 file changed, 69 insertions(+), 15 deletions(-) diff --git a/Models/Text/WordSeg/Model.swift b/Models/Text/WordSeg/Model.swift index 94b08ddc714..f36429cfcc1 100644 --- a/Models/Text/WordSeg/Model.swift +++ b/Models/Text/WordSeg/Model.swift @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + // Original Paper: // "Learning to Discover, Ground, and Use Words with Segmental Neural Language // Models" @@ -18,22 +19,29 @@ // https://www.aclweb.org/anthology/P19-1645.pdf // This implementation is not affiliated with DeepMind and has not been // verified by the authors. 
+ import ModelSupport import TensorFlow -/// SNLM -/// -/// A representation of the Segmental Neural Language Model. -/// -/// \ref https://www.aclweb.org/anthology/P19-1645.pdf +/// A Segmental Neural Language Model for word segmentation, as described in +/// the above paper. public struct SNLM: EuclideanDifferentiable, KeyPathIterable { + /// A set of configuration parameters that define model behavior. public struct Parameters { + /// The hidden unit size. public var ndim: Int + /// The dropout rate. public var dropoutProb: Double + /// The character vocabulary. public var chrVocab: Alphabet + /// The string vocabulary. public var strVocab: Lexicon + /// The power of the length penalty. public var order: Int + /// Creates an instance with `ndim` hidden units, `dropoutProb` dropout + /// rate, `chrVocab` alphabet, `strVocab` lexicon, and `order` power of + /// length penalty. public init( ndim: Int, dropoutProb: Double, @@ -49,27 +57,40 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { } } + /// The configuration parameters that define model behavior. @noDerivative public var parameters: Parameters // MARK: - Encoder + /// The embedding layer for the encoder. public var encoderEmbedding: Embedding + /// The LSTM layer for the encoder. public var encoderLSTM: LSTM // MARK: - Interpolation weight + /// The interpolation weight, which determines the proportion of + /// contributions from the lexical memory and character generation. public var mlpInterpolation: MLP // MARK: - Lexical memory + /// The lexical memory. public var mlpMemory: MLP // MARK: - Character-level decoder + /// The embedding layer for the decoder. public var decoderEmbedding: Embedding + /// The LSTM layer for the decoder. public var decoderLSTM: LSTM + /// The dense layer for the decoder. public var decoderDense: Dense // MARK: - Other layers + /// The dropout layer for both the encoder and decoder. 
public var dropout: Dropout // MARK: - Initializer + /// Creates an instance with the configuration defined by `parameters`. + /// + /// - Parameter parameters: the model configuration. public init(parameters: Parameters) { self.parameters = parameters @@ -113,7 +134,9 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { } // MARK: - Encode - /// Returns the hidden states of the encoder LSTM applied to the given sentence. + /// Returns the hidden states of the encoder LSTM applied to `x`. + /// + /// - Parameter x: the character sequence to encode. public func encode(_ x: CharacterSequence) -> [Tensor] { var embedded = encoderEmbedding(x.tensor) embedded = dropout(embedded) @@ -125,7 +148,10 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { } // MARK: - Decode - /// Returns log probabilities for each of the candidates. + /// Returns the log probabilities for each of the candidates. + /// + /// - Parameter candidates: the character sequences to decode. + /// - Parameter state: the hidden state from the encoder LSTM. public func decode(_ candidates: [CharacterSequence], _ state: Tensor) -> Tensor { // TODO(TF-433): Remove closure workaround when autodiff supports non-active rethrowing // functions (`Array.map`). @@ -192,6 +218,12 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { } // MARK: - buildLattice + /// Returns the log likelihood for `candidate` from the lexical memory + /// `logp_lex`. + /// + /// - Parameter logp_lex: all log likelihoods in the lexical memory. + /// - Parameter candidate: the character sequence for which to retrieve the + /// log likelihood. 
func get_logp_lex(_ logp_lex: Tensor, _ candidate: CharacterSequence) -> Tensor { guard let index = parameters.strVocab.dictionary[candidate] else { return Tensor(-Float.infinity) @@ -199,6 +231,12 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { return logp_lex[Int(index)] } + /// Returns a complete lattice for `sentence` with a maximum length of + /// `maxLen`. + /// + /// - Parameter sentence: the character sequence used for determining + /// segmentation. + /// - Parameter maxLen: the maximum allowable sequence length. @differentiable public func buildLattice(_ sentence: CharacterSequence, maxLen: Int) -> Lattice { var lattice = Lattice(count: sentence.count) @@ -265,15 +303,17 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { } extension Array { - // NOTE(TF-1277): this mutating method exists as a workaround for `Array.subscript._modify` not - // being differentiable. - // - // Semantically, it behaves like `Array.subscript.set`. + /// Sets the `index`th element of `self` to `value`. Semantically, it + /// behaves like `Array.subscript.set`. + /// + /// - Note: this mutating method exists as a workaround for + /// `Array.subscript._modify` not being differentiable (TF-1277). @inlinable mutating func update(at index: Int, to value: Element) { self[index] = value } + /// Returns the value and pullback of `self.update`. @usableFromInline @derivative(of: update) mutating func vjpUpdate(at index: Int, to value: Element) -> ( @@ -290,17 +330,29 @@ extension Array { } } +/// A multilayer perceptron with three layers. public struct MLP: Layer { + /// The first dense layer. public var dense1: Dense + /// The dropout layer. public var dropout: Dropout + /// The second dense layer. public var dense2: Dense + /// Creates an instance with input size `nIn`, `nHidden` hidden units, + /// dropout probability `dropoutProbability` and output size `nOut`. + /// + /// - Parameter nIn: input size. + /// - Parameter nHidden: number of hidden units. 
+ /// - Parameter nOut: output size. + /// - Parameter dropoutProbability: probability that an input is dropped. public init(nIn: Int, nHidden: Int, nOut: Int, dropoutProbability: Double) { dense1 = Dense(inputSize: nIn, outputSize: nHidden, activation: tanh) dropout = Dropout(probability: dropoutProbability) dense2 = Dense(inputSize: nHidden, outputSize: nOut, activation: logSoftmax) } + /// Returns the result of applying all three layers in sequence to `input`. @differentiable public func callAsFunction(_ input: Tensor) -> Tensor { return dense2(dropout(dense1(input))) @@ -308,10 +360,11 @@ public struct MLP: Layer { } extension Tensor { - // NOTE(TF-1008): this is a workaround for TF-1008 that is needed for differentiation - // correctness. - // - // Remove this when differentiation uses per-instance zeros + /// Returns `self`. + /// + /// - Note: this is a workaround for TF-1008 that is needed for + /// differentiation correctness. + // TODO: Remove this when differentiation uses per-instance zeros // (`Differentiable.zeroTangentVectorInitializer`) instead of static zeros // (`AdditiveArithmetic.zero`). @differentiable(where Scalar: TensorFlowFloatingPoint) @@ -319,6 +372,7 @@ extension Tensor { self } + /// Returns the value and pullback of `self.identityADHack`. 
@derivative(of: identityADHack) func vjpIdentityADHack() -> ( value: Tensor, pullback: (Tensor) -> Tensor From d36a20c39e659d37190d97063b9145d3012b0308 Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Thu, 28 May 2020 23:57:59 +0000 Subject: [PATCH 07/30] Add bullets for throws MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace various verbs with “returns” --- Datasets/WordSeg/WordSegDataset.swift | 19 ++++++++++++++++--- Support/Text/WordSeg/CharacterSequence.swift | 2 ++ 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift index 93476f74488..5e71ac18a13 100644 --- a/Datasets/WordSeg/WordSegDataset.swift +++ b/Datasets/WordSeg/WordSegDataset.swift @@ -46,6 +46,8 @@ public struct WordSegDataset { /// Returns a list of records parsed from `data` in UTF8. /// /// - Parameter data: text in UTF8 format. + /// + /// - Throws: An error of type 'CharacterErrors'. private static func load(data: Data) throws -> [String] { guard let contents: String = String(data: data, encoding: .utf8) else { throw CharacterErrors.nonUtf8Data @@ -53,8 +55,9 @@ public struct WordSegDataset { return load(contents: contents) } - /// Separates `contents` into a collection of strings by newlines, trimming - /// leading and trailing whitespace and excluding blank lines. + /// Returns a collection of strings created by separating `contents` by + /// newlines, trimming leading and trailing whitespace, and excluding blank + /// lines. /// /// - Parameter contents: text to be separated by newline. private static func load(contents: String) -> [String] { @@ -102,10 +105,12 @@ public struct WordSegDataset { return Alphabet(sorted, eos: eos, eow: eow, pad: pad) } - /// Creates a collection of records to be used with the WordSeg model. + /// Returns a collection of records to be used with the WordSeg model. /// /// - Parameter dataset: text to be converted. 
/// - Parameter alphabet: set of all characters used in `dataset`. + /// + /// - Throws: An error of type 'CharacterErrors'. private static func convertDataset(_ dataset: [String], alphabet: Alphabet) throws -> [WordSegRecord] { @@ -123,6 +128,8 @@ public struct WordSegDataset { /// /// - Parameter dataset: text to be converted. /// - Parameter alphabet: set of all characters used in `dataset`. + /// + /// - Throws: An error of type 'CharacterErrors'. private static func convertDataset(_ dataset: [String]?, alphabet: Alphabet) throws -> [WordSegRecord]? { @@ -135,6 +142,8 @@ public struct WordSegDataset { /// Creates an instance containing `WordSegRecords` from the default /// location. + /// + /// - Throws: An error of type 'CharacterErrors'. public init() throws { let downloadDetails = DownloadDetails() let localStorageDirectory: URL = FileManager.default.temporaryDirectory @@ -165,6 +174,8 @@ public struct WordSegDataset { /// - Parameter training: path to the file containing training data. /// - Parameter validation: path to the file containing validation data. /// - Parameter testing: path to the file containing test data. + /// + /// - Throws: An error of type 'CharacterErrors'. public init( training trainingFile: String, validation validationFile: String? = nil, @@ -202,6 +213,8 @@ public struct WordSegDataset { /// - Parameter training: contents of the training data. /// - Parameter validation: contents of the validation data. /// - Parameter testing: contents of the test data. + /// + /// - Throws: An error of type 'CharacterErrors'. public init( training trainingData: Data, validation validationData: Data?, testing testingData: Data? 
) diff --git a/Support/Text/WordSeg/CharacterSequence.swift b/Support/Text/WordSeg/CharacterSequence.swift index 2eb4f6714bb..b8df75fd8ea 100644 --- a/Support/Text/WordSeg/CharacterSequence.swift +++ b/Support/Text/WordSeg/CharacterSequence.swift @@ -33,6 +33,8 @@ public struct CharacterSequence: Hashable { /// - Parameter alphabet: character to integer mapping. /// - Parameter appendingEoSTo: string to be converted to a sequence of /// integers. + /// + /// - Throws: An error of type 'CharacterErrors'. public init(alphabet: Alphabet, appendingEoSTo string: String) throws { var characters = [Int32]() characters.reserveCapacity(string.count + 1) From 951a98b96f060690a98eeb09d0341d81f3ee1069 Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Tue, 2 Jun 2020 00:48:22 +0000 Subject: [PATCH 08/30] Rename WordSegRecord to Phrase Rename DownloadDetails to ReferenceArchive Combine URL with filename and extension Update main summary in WordSegDataset Add blank line before doc comments in WordSegDataset --- .../{WordSegRecord.swift => Phrase.swift} | 4 +- Datasets/WordSeg/WordSegDataset.swift | 93 +++++++++++-------- Examples/WordSeg/main.swift | 16 ++-- 3 files changed, 65 insertions(+), 48 deletions(-) rename Datasets/WordSeg/{WordSegRecord.swift => Phrase.swift} (97%) diff --git a/Datasets/WordSeg/WordSegRecord.swift b/Datasets/WordSeg/Phrase.swift similarity index 97% rename from Datasets/WordSeg/WordSegRecord.swift rename to Datasets/WordSeg/Phrase.swift index 8445477a1eb..46da27bb15b 100644 --- a/Datasets/WordSeg/WordSegRecord.swift +++ b/Datasets/WordSeg/Phrase.swift @@ -15,9 +15,11 @@ import ModelSupport /// A sequence of text for use in word segmentation. -public struct WordSegRecord { +public struct Phrase { + /// A raw, unprocessed sequence of text. public let plainText: String + /// A sequence of text in numeric form, derived from `plainText`. 
public let numericalizedText: CharacterSequence diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift index 131be208958..4ff8ad1521e 100644 --- a/Datasets/WordSeg/WordSegDataset.swift +++ b/Datasets/WordSeg/WordSegDataset.swift @@ -15,30 +15,38 @@ import Foundation import ModelSupport -/// A collection of raw and processed text used for training and validation -/// of word segmentation models. +/// A dataset targeted at the problem of word segmentation. +/// +/// The reference archive was published in the paper "Learning to Discover, +/// Ground, and Use Words with Segmental Neural Language Models" by Kazuya +/// Kawakami, Chris Dyer, and Phil Blunsom: +/// https://www.aclweb.org/anthology/P19-1645.pdf. public struct WordSegDataset { - /// A collection of text used for training. - public let training: [WordSegRecord] - /// A collection of text used for testing. - public private(set) var testing: [WordSegRecord]? - /// A collection of text used for validation. - public private(set) var validation: [WordSegRecord]? + + /// The text used for training. + public let trainingPhrases: [Phrase] + + /// The text used for testing. + public private(set) var testingPhrases: [Phrase]? + + /// The text used for validation. + public private(set) var validationPhrases: [Phrase]? + /// The set of characters found in all included texts. public let alphabet: Alphabet /// Details used for downloading source data. - private struct DownloadDetails { + private struct ReferenceArchive { + /// The location of the archive. - var archiveLocation = URL(string: "https://s3.eu-west-2.amazonaws.com/k-kawakami")! - /// The basename of the archive. - var archiveFileName = "seg" - /// The extension of the archive. - var archiveExtension = "zip" + var location = URL(string: "https://s3.eu-west-2.amazonaws.com/k-kawakami/seg.zip")! + /// The path to the test source. var testingFilePath = "br/br-text/te.txt" + /// The path to the training source. 
var trainingFilePath = "br/br-text/tr.txt" + /// The path to the validation source. var validationFilePath = "br/br-text/va.txt" } @@ -71,7 +79,7 @@ public struct WordSegDataset { return strings } - /// Returns an alphabet composed of all characters found in `training` and + /// Returns an alphabet composed of all characters found in `trainingPhrases` and /// `otherSequences`. /// /// - Parameter training: full text of the training data. @@ -112,11 +120,11 @@ public struct WordSegDataset { /// /// - Throws: An error of type 'CharacterErrors'. private static func convertDataset(_ dataset: [String], alphabet: Alphabet) throws - -> [WordSegRecord] + -> [Phrase] { return try dataset.map { let trimmed = $0.components(separatedBy: .whitespaces).joined() - return try WordSegRecord( + return try Phrase( plainText: $0, numericalizedText: CharacterSequence( alphabet: alphabet, appendingEoSTo: trimmed)) @@ -131,45 +139,48 @@ public struct WordSegDataset { /// /// - Throws: An error of type 'CharacterErrors'. private static func convertDataset(_ dataset: [String]?, alphabet: Alphabet) throws - -> [WordSegRecord]? + -> [Phrase]? { if let ds = dataset { - let tmp: [WordSegRecord] = try convertDataset(ds, alphabet: alphabet) // Use tmp to disambiguate function + let tmp: [Phrase] = try convertDataset(ds, alphabet: alphabet) // Use tmp to disambiguate function return tmp } return nil } - /// Creates an instance containing `WordSegRecords` from the default + /// Creates an instance containing `Phrase`s from the default /// location. /// /// - Throws: An error of type 'CharacterErrors'. 
public init() throws { - let downloadDetails = DownloadDetails() + let referenceArchive = ReferenceArchive() let localStorageDirectory: URL = DatasetUtilities.defaultDirectory .appendingPathComponent("WordSeg", isDirectory: true) - WordSegDataset.downloadIfNotPresent(to: localStorageDirectory, downloadDetails: downloadDetails) + WordSegDataset.downloadIfNotPresent(to: localStorageDirectory, referenceArchive: referenceArchive) + let archiveFileName = + referenceArchive + .location.deletingPathExtension().lastPathComponent let archiveDirectory = localStorageDirectory - .appendingPathComponent(downloadDetails.archiveFileName) + .appendingPathComponent(archiveFileName) let trainingFilePath = archiveDirectory - .appendingPathComponent(downloadDetails.trainingFilePath).path + .appendingPathComponent(referenceArchive.trainingFilePath).path let validationFilePath = archiveDirectory - .appendingPathComponent(downloadDetails.validationFilePath).path + .appendingPathComponent(referenceArchive.validationFilePath).path let testingFilePath = archiveDirectory - .appendingPathComponent(downloadDetails.testingFilePath).path + .appendingPathComponent(referenceArchive.testingFilePath).path try self.init( training: trainingFilePath, validation: validationFilePath, testing: testingFilePath) } - /// Creates an instance containing `WordSegRecords` from the given files. + /// Creates an instance containing `Phrase`s from the given files. /// /// - Parameter training: path to the file containing training data. /// - Parameter validation: path to the file containing validation data. 
@@ -203,12 +214,12 @@ public struct WordSegDataset { testing = try Self.load(data: data) } self.alphabet = Self.makeAlphabet(datasets: training, validation, testing) - self.training = try Self.convertDataset(training, alphabet: self.alphabet) - self.validation = try Self.convertDataset(validation, alphabet: self.alphabet) - self.testing = try Self.convertDataset(testing, alphabet: self.alphabet) + self.trainingPhrases = try Self.convertDataset(training, alphabet: self.alphabet) + self.validationPhrases = try Self.convertDataset(validation, alphabet: self.alphabet) + self.testingPhrases = try Self.convertDataset(testing, alphabet: self.alphabet) } - /// Creates an instance containing `WordSegRecords` from the given data. + /// Creates an instance containing `Phrase`s from the given data. /// /// - Parameter training: contents of the training data. /// - Parameter validation: contents of the validation data. @@ -231,17 +242,17 @@ public struct WordSegDataset { } self.alphabet = Self.makeAlphabet(datasets: training, validation, testing) - self.training = try Self.convertDataset(training, alphabet: self.alphabet) - self.validation = try Self.convertDataset(validation, alphabet: self.alphabet) - self.testing = try Self.convertDataset(testing, alphabet: self.alphabet) + self.trainingPhrases = try Self.convertDataset(training, alphabet: self.alphabet) + self.validationPhrases = try Self.convertDataset(validation, alphabet: self.alphabet) + self.testingPhrases = try Self.convertDataset(testing, alphabet: self.alphabet) } /// Downloads and unpacks the source archive if it does not exist locally. /// /// - Parameter directory: local directory to store files. - /// - Parameter downloadDetails: where to find the source archive. + /// - Parameter referenceArchive: where to find the source archive. 
private static func downloadIfNotPresent( - to directory: URL, downloadDetails: DownloadDetails + to directory: URL, referenceArchive: ReferenceArchive ) { let downloadPath = directory.path let directoryExists = FileManager.default.fileExists(atPath: downloadPath) @@ -250,11 +261,15 @@ public struct WordSegDataset { guard !directoryExists || directoryEmpty else { return } + let remoteRoot = referenceArchive.location.deletingLastPathComponent() + let filename = referenceArchive.location.deletingPathExtension().lastPathComponent + let fileExtension = referenceArchive.location.pathExtension + // Downloads and extracts dataset files. let _ = DatasetUtilities.downloadResource( - filename: downloadDetails.archiveFileName, - fileExtension: downloadDetails.archiveExtension, - remoteRoot: downloadDetails.archiveLocation, + filename: filename, + fileExtension: fileExtension, + remoteRoot: remoteRoot, localStorageDirectory: directory, extract: true) } } diff --git a/Examples/WordSeg/main.swift b/Examples/WordSeg/main.swift index 98ca33e2104..a09db0ce51f 100644 --- a/Examples/WordSeg/main.swift +++ b/Examples/WordSeg/main.swift @@ -50,7 +50,7 @@ default: usage() } -let sequences = dataset.training.map { $0.numericalizedText } +let sequences = dataset.trainingPhrases.map { $0.numericalizedText } let lexicon = Lexicon( from: sequences, alphabet: dataset.alphabet, @@ -76,8 +76,8 @@ for epoch in 1...maxEpochs { Context.local.learningPhase = .training var trainingLossSum: Float = 0 var trainingBatchCount = 0 - for record in dataset.training { - let sentence = record.numericalizedText + for phrase in dataset.trainingPhrases { + let sentence = phrase.numericalizedText let (loss, gradients) = valueWithGradient(at: model) { model -> Tensor in let lattice = model.buildLattice(sentence, maxLen: maxLength) let score = lattice[sentence.count].semiringScore @@ -103,7 +103,7 @@ for epoch in 1...maxEpochs { trainingLossHistory.append(trainingLoss) reduceLROnPlateau(lossHistory: 
trainingLossHistory, optimizer: optimizer) - guard let validationDataset = dataset.validation else { + guard let validationPhrases = dataset.validationPhrases else { print( """ [Epoch \(epoch)] \ @@ -127,8 +127,8 @@ for epoch in 1...maxEpochs { var validationBatchCount = 0 var validationCharacterCount = 0 var validationPlainText: String = "" - for record in validationDataset { - let sentence = record.numericalizedText + for phrase in validationPhrases { + let sentence = phrase.numericalizedText var lattice = model.buildLattice(sentence, maxLen: maxLength) let score = lattice[sentence.count].semiringScore @@ -137,8 +137,8 @@ for epoch in 1...maxEpochs { validationCharacterCount += sentence.count // View a sample segmentation once per epoch. - if validationBatchCount == validationDataset.count { - let bestPath = lattice.viterbi(sentence: record.numericalizedText) + if validationBatchCount == validationPhrases.count { + let bestPath = lattice.viterbi(sentence: phrase.numericalizedText) validationPlainText = Lattice.pathToPlainText(path: bestPath, alphabet: dataset.alphabet) } } From 4de6e7d8eba9b8f22629ca5e1b3c903027ab2180 Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Tue, 2 Jun 2020 01:00:05 +0000 Subject: [PATCH 09/30] Update CMakeLists --- Datasets/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Datasets/CMakeLists.txt b/Datasets/CMakeLists.txt index 3f1b519cb7d..b2344d69e4c 100644 --- a/Datasets/CMakeLists.txt +++ b/Datasets/CMakeLists.txt @@ -21,7 +21,7 @@ add_library(Datasets TensorPair.swift TextUnsupervised/TextUnsupervised.swift WordSeg/WordSegDataset.swift - WordSeg/WordSegRecord.swift + WordSeg/Phrase.swift ImageSegmentationDataset.swift OxfordIIITPets/OxfordIIITPets.swift) target_link_libraries(Datasets PUBLIC From 060e88b7d62f08c5b9263e9a40460575d27d3040 Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Tue, 2 Jun 2020 02:38:16 +0000 Subject: [PATCH 10/30] Clarify more summaries. 
Remove explicit parameter descriptions and add them to summaries. Handle errors instead of throwing. Remove CharacterErrors.nonUtf8Data. Update attribute names in dataset tests. --- Datasets/WordSeg/WordSegDataset.swift | 143 ++++++++---------- Support/Text/WordSeg/CharacterSequence.swift | 3 - .../WordSeg/WordSegDatasetTests.swift | 16 +- 3 files changed, 67 insertions(+), 95 deletions(-) diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift index 4ff8ad1521e..119b047d58a 100644 --- a/Datasets/WordSeg/WordSegDataset.swift +++ b/Datasets/WordSeg/WordSegDataset.swift @@ -32,10 +32,10 @@ public struct WordSegDataset { /// The text used for validation. public private(set) var validationPhrases: [Phrase]? - /// The set of characters found in all included texts. + /// The union of all characters in the included dataset. public let alphabet: Alphabet - /// Details used for downloading source data. + /// A pointer to source data. private struct ReferenceArchive { /// The location of the archive. @@ -51,23 +51,15 @@ public struct WordSegDataset { var validationFilePath = "br/br-text/va.txt" } - /// Returns a list of records parsed from `data` in UTF8. - /// - /// - Parameter data: text in UTF8 format. - /// - /// - Throws: An error of type 'CharacterErrors'. - private static func load(data: Data) throws -> [String] { + /// Returns the text of all phrases parsed from `data` in UTF8. + private static func load(data: Data) -> [String] { guard let contents: String = String(data: data, encoding: .utf8) else { - throw CharacterErrors.nonUtf8Data + return [] } return load(contents: contents) } - /// Returns a collection of strings created by separating `contents` by - /// newlines, trimming leading and trailing whitespace, and excluding blank - /// lines. - /// - /// - Parameter contents: text to be separated by newline. + /// Returns the text of all phrases from `contents`. 
private static func load(contents: String) -> [String] { var strings = [String]() @@ -79,12 +71,8 @@ public struct WordSegDataset { return strings } - /// Returns an alphabet composed of all characters found in `trainingPhrases` and - /// `otherSequences`. + /// Returns the union of all characters in `training` and `otherSequences`. /// - /// - Parameter training: full text of the training data. - /// - Parameter otherSequences: optional full text of the validation and - /// test data. /// - Parameter eos: text to be used as the end of sequence marker. /// - Parameter eow: text to be used as the end of word marker. /// - Parameter pad: text to be used as the padding marker. @@ -113,45 +101,33 @@ public struct WordSegDataset { return Alphabet(sorted, eos: eos, eow: eow, pad: pad) } - /// Returns a collection of records to be used with the WordSeg model. - /// - /// - Parameter dataset: text to be converted. - /// - Parameter alphabet: set of all characters used in `dataset`. + /// Returns phrases from `dataset`, using `alphabet`, to be used with the + /// WordSeg model. /// - /// - Throws: An error of type 'CharacterErrors'. - private static func convertDataset(_ dataset: [String], alphabet: Alphabet) throws + /// - Note: Omits any part of the dataset that cannot be converted to + /// `CharacterSequence`. + private static func convertDataset(_ dataset: [String], alphabet: Alphabet) -> [Phrase] { - return try dataset.map { - let trimmed = $0.components(separatedBy: .whitespaces).joined() - return try Phrase( - plainText: $0, - numericalizedText: CharacterSequence( - alphabet: alphabet, appendingEoSTo: trimmed)) + var phrases = [Phrase]() + + for data in dataset { + let trimmed = data.components(separatedBy: .whitespaces).joined() + guard let numericalizedText = try? 
CharacterSequence( + alphabet: alphabet, appendingEoSTo: trimmed) else { continue } + let phrase = Phrase( + plainText: data, + numericalizedText: numericalizedText) + phrases.append(phrase) } - } - /// Returns a collection of records to be used with the WordSeg model, or - /// `nil` if `dataset` is empty. - /// - /// - Parameter dataset: text to be converted. - /// - Parameter alphabet: set of all characters used in `dataset`. - /// - /// - Throws: An error of type 'CharacterErrors'. - private static func convertDataset(_ dataset: [String]?, alphabet: Alphabet) throws - -> [Phrase]? - { - if let ds = dataset { - let tmp: [Phrase] = try convertDataset(ds, alphabet: alphabet) // Use tmp to disambiguate function - return tmp - } - return nil + return phrases } - /// Creates an instance containing `Phrase`s from the default - /// location. + /// Creates an instance containing phrases from the default location. /// - /// - Throws: An error of type 'CharacterErrors'. + /// - Throws: an error in the Cocoa domain, if the default training file + /// cannot be read. public init() throws { let referenceArchive = ReferenceArchive() let localStorageDirectory: URL = DatasetUtilities.defaultDirectory @@ -180,13 +156,11 @@ public struct WordSegDataset { testing: testingFilePath) } - /// Creates an instance containing `Phrase`s from the given files. + /// Creates an instance containing phrases from `trainingFile`, and + /// optionally `validationFile` and `testingFile`. /// - /// - Parameter training: path to the file containing training data. - /// - Parameter validation: path to the file containing validation data. - /// - Parameter testing: path to the file containing test data. - /// - /// - Throws: An error of type 'CharacterErrors'. + /// - Throws: an error in the Cocoa domain, if `trainingFile` cannot be + /// read. public init( training trainingFile: String, validation validationFile: String? 
= nil, @@ -195,62 +169,63 @@ public struct WordSegDataset { let trainingData = try Data( contentsOf: URL(fileURLWithPath: trainingFile), options: .alwaysMapped) - let training = try Self.load(data: trainingData) + let training = Self.load(data: trainingData) - var validation: [String]? = nil - var testing: [String]? = nil + let validation: [String] + let testing: [String] if let validationFile = validationFile { let data = try Data( contentsOf: URL(fileURLWithPath: validationFile), options: .alwaysMapped) - validation = try Self.load(data: data) + validation = Self.load(data: data) + } else { + validation = [String]() } if let testingFile = testingFile { let data: Data = try Data( contentsOf: URL(fileURLWithPath: testingFile), options: .alwaysMapped) - testing = try Self.load(data: data) + testing = Self.load(data: data) + } else { + testing = [String]() } + self.alphabet = Self.makeAlphabet(datasets: training, validation, testing) - self.trainingPhrases = try Self.convertDataset(training, alphabet: self.alphabet) - self.validationPhrases = try Self.convertDataset(validation, alphabet: self.alphabet) - self.testingPhrases = try Self.convertDataset(testing, alphabet: self.alphabet) + self.trainingPhrases = Self.convertDataset(training, alphabet: self.alphabet) + self.validationPhrases = Self.convertDataset(validation, alphabet: self.alphabet) + self.testingPhrases = Self.convertDataset(testing, alphabet: self.alphabet) } - /// Creates an instance containing `Phrase`s from the given data. - /// - /// - Parameter training: contents of the training data. - /// - Parameter validation: contents of the validation data. - /// - Parameter testing: contents of the test data. - /// - /// - Throws: An error of type 'CharacterErrors'. + /// Creates an instance containing phrases from `trainingData`, and + /// optionally `validationData` and `testingData`. public init( training trainingData: Data, validation validationData: Data?, testing testingData: Data? 
) - throws { - let training = try Self.load(data: trainingData) - var validation: [String]? = nil - var testing: [String]? = nil + let training = Self.load(data: trainingData) + let validation: [String] + let testing: [String] if let validationData = validationData { - validation = try Self.load(data: validationData) + validation = Self.load(data: validationData) + } else { + validation = [String]() } if let testingData = testingData { - testing = try Self.load(data: testingData) + testing = Self.load(data: testingData) + } else { + testing = [String]() } self.alphabet = Self.makeAlphabet(datasets: training, validation, testing) - self.trainingPhrases = try Self.convertDataset(training, alphabet: self.alphabet) - self.validationPhrases = try Self.convertDataset(validation, alphabet: self.alphabet) - self.testingPhrases = try Self.convertDataset(testing, alphabet: self.alphabet) + self.trainingPhrases = Self.convertDataset(training, alphabet: self.alphabet) + self.validationPhrases = Self.convertDataset(validation, alphabet: self.alphabet) + self.testingPhrases = Self.convertDataset(testing, alphabet: self.alphabet) } - /// Downloads and unpacks the source archive if it does not exist locally. - /// - /// - Parameter directory: local directory to store files. - /// - Parameter referenceArchive: where to find the source archive. + /// Downloads and unpacks `referenceArchive` to `directory` if it does not + /// exist locally. private static func downloadIfNotPresent( to directory: URL, referenceArchive: ReferenceArchive ) { diff --git a/Support/Text/WordSeg/CharacterSequence.swift b/Support/Text/WordSeg/CharacterSequence.swift index b8df75fd8ea..d4c4101afba 100644 --- a/Support/Text/WordSeg/CharacterSequence.swift +++ b/Support/Text/WordSeg/CharacterSequence.swift @@ -100,7 +100,6 @@ extension CharacterSequence: CustomStringConvertible { /// An error that can be encountered when processing characters. 
public enum CharacterErrors: Error { case unknownCharacter(character: Character, index: Int, sentence: String) - case nonUtf8Data } extension CharacterErrors: CustomStringConvertible { @@ -110,8 +109,6 @@ extension CharacterErrors: CustomStringConvertible { case let .unknownCharacter(character, index, sentence): return "Unknown character '\(character)' encountered at index \(index) while converting sentence \"\(sentence)\" to a character sequence." - case .nonUtf8Data: - return "Non-UTF8 data encountered." } } } diff --git a/Tests/DatasetsTests/WordSeg/WordSegDatasetTests.swift b/Tests/DatasetsTests/WordSeg/WordSegDatasetTests.swift index e3ea2da3d37..06f073dd542 100644 --- a/Tests/DatasetsTests/WordSeg/WordSegDatasetTests.swift +++ b/Tests/DatasetsTests/WordSeg/WordSegDatasetTests.swift @@ -20,26 +20,26 @@ class WordSegDatasetTests: XCTestCase { func testCreateWordSegDataset() { do { let dataset = try WordSegDataset() - XCTAssertEqual(dataset.training.count, 7832) - XCTAssertEqual(dataset.validation!.count, 979) - XCTAssertEqual(dataset.testing!.count, 979) + XCTAssertEqual(dataset.trainingPhrases.count, 7832) + XCTAssertEqual(dataset.validationPhrases!.count, 979) + XCTAssertEqual(dataset.testingPhrases!.count, 979) // Check the first example in each set. 
let trainingExample: [Int32] = [ 26, 16, 22, 24, 2, 15, 21, 21, 16, 20, 6, 6, 21, 9, 6, 3, 16, 16, 12, 28, ] - XCTAssertEqual(dataset.training[0].numericalizedText.characters, trainingExample) + XCTAssertEqual(dataset.trainingPhrases[0].numericalizedText.characters, trainingExample) let validationExample: [Int32] = [9, 6, 13, 13, 16, 14, 10, 14, 10, 28] - XCTAssertEqual(dataset.validation![0].numericalizedText.characters, validationExample) + XCTAssertEqual(dataset.validationPhrases![0].numericalizedText.characters, validationExample) let testingExample: [Int32] = [ 13, 6, 21, 14, 6, 20, 6, 6, 10, 7, 10, 4, 2, 15, 20, 6, 6, 2, 15, 26, 3, 16, 5, 26, 10, 15, 21, 9, 2, 21, 14, 10, 19, 19, 16, 19, 28, ] - XCTAssertEqual(dataset.testing![0].numericalizedText.characters, testingExample) + XCTAssertEqual(dataset.testingPhrases![0].numericalizedText.characters, testingExample) } catch { XCTFail(error.localizedDescription) } @@ -57,12 +57,12 @@ class WordSegDatasetTests: XCTestCase { Data( bytesNoCopy: UnsafeMutableRawPointer(mutating: address), count: pointer.count, deallocator: .none) - dataset = try? 
WordSegDataset(training: training, validation: nil, testing: nil) + dataset = WordSegDataset(training: training, validation: nil, testing: nil) } // 'a', 'h', 'l', 'p', '', '', '' XCTAssertEqual(dataset?.alphabet.count, 7) - XCTAssertEqual(dataset?.training.count, 1) + XCTAssertEqual(dataset?.trainingPhrases.count, 1) } static var allTests = [ From 04c58bbc23ca29aec072e4c6571310ea2baa80ed Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Mon, 1 Jun 2020 22:42:45 -0400 Subject: [PATCH 11/30] Lint --- Datasets/WordSeg/WordSegDataset.swift | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift index 119b047d58a..f45a3374007 100644 --- a/Datasets/WordSeg/WordSegDataset.swift +++ b/Datasets/WordSeg/WordSegDataset.swift @@ -113,8 +113,10 @@ public struct WordSegDataset { for data in dataset { let trimmed = data.components(separatedBy: .whitespaces).joined() - guard let numericalizedText = try? CharacterSequence( - alphabet: alphabet, appendingEoSTo: trimmed) else { continue } + guard + let numericalizedText = try? CharacterSequence( + alphabet: alphabet, appendingEoSTo: trimmed) + else { continue } let phrase = Phrase( plainText: data, numericalizedText: numericalizedText) @@ -133,7 +135,8 @@ public struct WordSegDataset { let localStorageDirectory: URL = DatasetUtilities.defaultDirectory .appendingPathComponent("WordSeg", isDirectory: true) - WordSegDataset.downloadIfNotPresent(to: localStorageDirectory, referenceArchive: referenceArchive) + WordSegDataset.downloadIfNotPresent( + to: localStorageDirectory, referenceArchive: referenceArchive) let archiveFileName = referenceArchive @@ -202,8 +205,7 @@ public struct WordSegDataset { /// optionally `validationData` and `testingData`. public init( training trainingData: Data, validation validationData: Data?, testing testingData: Data? 
- ) - { + ) { let training = Self.load(data: trainingData) let validation: [String] let testing: [String] From 079c47f95c2470f4bada9c0e49b864405fc05b9f Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Tue, 2 Jun 2020 15:51:42 +0000 Subject: [PATCH 12/30] Add blank lines Update summary in Phrase to include parameter names. --- Datasets/WordSeg/Phrase.swift | 7 ++---- Models/Text/WordSeg/Lattice.swift | 16 ++++++++++++++ Models/Text/WordSeg/Model.swift | 23 ++++++++++++++++++++ Models/Text/WordSeg/SemiRing.swift | 6 +++++ Support/Text/WordSeg/Alphabet.swift | 3 +++ Support/Text/WordSeg/CharacterSequence.swift | 6 +++++ Support/Text/WordSeg/Lexicon.swift | 1 + 7 files changed, 57 insertions(+), 5 deletions(-) diff --git a/Datasets/WordSeg/Phrase.swift b/Datasets/WordSeg/Phrase.swift index 46da27bb15b..e2ccb02fad1 100644 --- a/Datasets/WordSeg/Phrase.swift +++ b/Datasets/WordSeg/Phrase.swift @@ -23,11 +23,8 @@ public struct Phrase { /// A sequence of text in numeric form, derived from `plainText`. public let numericalizedText: CharacterSequence - /// Creates an instance containing both raw and processed forms of a - /// sequence of text. - /// - /// - Parameter plainText: raw, unprocessed text. - /// - Parameter numericalizedText: processed text in numeric form. + /// Creates an instance containing both raw (`plainText`) and processed + /// (`numericalizedText`) forms of a sequence of text. public init(plainText: String, numericalizedText: CharacterSequence) { self.plainText = plainText self.numericalizedText = numericalizedText diff --git a/Models/Text/WordSeg/Lattice.swift b/Models/Text/WordSeg/Lattice.swift index cda03ba5218..ddd5168883d 100644 --- a/Models/Text/WordSeg/Lattice.swift +++ b/Models/Text/WordSeg/Lattice.swift @@ -27,22 +27,28 @@ import TensorFlow /// sequence. The path with the best score provides the most likely /// segmentation at inference. public struct Lattice: Differentiable { + /// Represents a word. 
/// /// At each character position, an edge is constructed for every possible /// segmentation of the preceding portion of the sequence. public struct Edge: Differentiable { + /// The node position immediately preceding this edge. @noDerivative public var start: Int + /// The node position immediately following this edge. @noDerivative public var end: Int + /// The characters composing a word. @noDerivative public var string: CharacterSequence + /// The log likelihood of this segmentation. public var logp: Tensor /// The expected score for this segmentation. public var score: SemiRing + /// The expected total score for this segmentation. public var totalScore: SemiRing @@ -105,12 +111,16 @@ public struct Lattice: Differentiable { /// - Note: Scores are only meaningful in relation to incoming edges and the /// start node has no incoming edges. public struct Node: Differentiable { + /// The incoming edge with the highest score. @noDerivative public var bestEdge: Edge? + /// The score of the best incoming edge. public var bestScore: Float = 0.0 + /// All incoming edges. public var edges = [Edge]() + /// A composite score of all incoming edges. public var semiringScore: SemiRing = SemiRing.one @@ -233,6 +243,7 @@ public struct Lattice: Differentiable { } extension Lattice: CustomStringConvertible { + /// The plain text description of this instance that describes all nodes. public var description: String { """ @@ -244,6 +255,7 @@ extension Lattice: CustomStringConvertible { } extension Lattice.Node: CustomStringConvertible { + /// The plain text description of this instance that describes all incoming /// edges. public var description: String { @@ -261,6 +273,7 @@ extension Lattice.Node: CustomStringConvertible { } extension Lattice.Edge: CustomStringConvertible { + /// The plain text description of this instance with all edge details. 
public var description: String { "[\(start)->\(end)] logp: \(logp), score: \(score.shortDescription), total score: \(totalScore.shortDescription), sentence: \(string)" @@ -268,6 +281,7 @@ extension Lattice.Edge: CustomStringConvertible { } extension Lattice { + /// Returns true when all nodes in `self` are within `tolerance` of all /// nodes in `other`. This behavior is modeled after SE-0259. /// @@ -292,6 +306,7 @@ extension Lattice { } extension Lattice.Node { + /// Returns true when all properties and edges in `self` are within /// `tolerance` of all properties and edges in `other`. This behavior is /// modeled after SE-0259. @@ -321,6 +336,7 @@ extension Lattice.Node { } extension Lattice.Edge { + /// Returns true when the log likelihood and scores in `self` are within /// `tolerance` of the log likelihood and scores in `other`. This behavior /// is modeled after SE-0259. diff --git a/Models/Text/WordSeg/Model.swift b/Models/Text/WordSeg/Model.swift index f36429cfcc1..b5c341f97f0 100644 --- a/Models/Text/WordSeg/Model.swift +++ b/Models/Text/WordSeg/Model.swift @@ -26,16 +26,22 @@ import TensorFlow /// A Segmental Neural Language Model for word segmentation, as described in /// the above paper. public struct SNLM: EuclideanDifferentiable, KeyPathIterable { + /// A set of configuration parameters that define model behavior. public struct Parameters { + /// The hidden unit size. public var ndim: Int + /// The dropout rate. public var dropoutProb: Double + /// The character vocabulary. public var chrVocab: Alphabet + /// The string vocabulary. public var strVocab: Lexicon + /// The power of the length penalty. public var order: Int @@ -61,33 +67,42 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { @noDerivative public var parameters: Parameters // MARK: - Encoder + /// The embedding layer for the encoder. public var encoderEmbedding: Embedding + /// The LSTM layer for the encoder. 
public var encoderLSTM: LSTM // MARK: - Interpolation weight + /// The interpolation weight, which determines the proportion of /// contributions from the lexical memory and character generation. public var mlpInterpolation: MLP // MARK: - Lexical memory + /// The lexical memory. public var mlpMemory: MLP // MARK: - Character-level decoder + /// The embedding layer for the decoder. public var decoderEmbedding: Embedding + /// The LSTM layer for the decoder. public var decoderLSTM: LSTM + /// The dense layer for the decoder. public var decoderDense: Dense // MARK: - Other layers + /// The dropout layer for both the encoder and decoder. public var dropout: Dropout // MARK: - Initializer + /// Creates an instance with the configuration defined by `parameters`. /// /// - Parameter parameters: the model configuration. @@ -134,6 +149,7 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { } // MARK: - Encode + /// Returns the hidden states of the encoder LSTM applied to `x`. /// /// - Parameter x: the character sequence to encode. @@ -148,6 +164,7 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { } // MARK: - Decode + /// Returns the log probabilities for each of the candidates. /// /// - Parameter candidates: the character sequences to decode. @@ -218,6 +235,7 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { } // MARK: - buildLattice + /// Returns the log likelihood for `candidate` from the lexical memory /// `logp_lex`. /// @@ -303,6 +321,7 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { } extension Array { + /// Sets the `index`th element of `self` to `value`. Semantically, it /// behaves like `Array.subscript.set`. /// @@ -332,10 +351,13 @@ extension Array { /// A multilayer perceptron with three layers. public struct MLP: Layer { + /// The first dense layer. public var dense1: Dense + /// The dropout layer. public var dropout: Dropout + /// The second dense layer. 
public var dense2: Dense @@ -360,6 +382,7 @@ public struct MLP: Layer { } extension Tensor { + /// Returns `self`. /// /// - Note: this is a workaround for TF-1008 that is needed for diff --git a/Models/Text/WordSeg/SemiRing.swift b/Models/Text/WordSeg/SemiRing.swift index c6854670036..0c7809b10c9 100644 --- a/Models/Text/WordSeg/SemiRing.swift +++ b/Models/Text/WordSeg/SemiRing.swift @@ -41,8 +41,10 @@ public func logSumExp(_ lhs: Tensor, _ rhs: Tensor) -> Tensor + /// The regularization factor. public var logr: Tensor @@ -64,6 +66,7 @@ public struct SemiRing: Differentiable { /// The baseline score of zero. static var zero: SemiRing { SemiRing(logp: -Float.infinity, logr: -Float.infinity) } + /// The baseline score of one. static var one: SemiRing { SemiRing(logp: 0.0, logr: -Float.infinity) } } @@ -86,6 +89,7 @@ func + (_ lhs: SemiRing, _ rhs: SemiRing) -> SemiRing { } extension Array where Element == SemiRing { + /// Returns a sum of all scores in the collection. @differentiable func sum() -> SemiRing { @@ -96,6 +100,7 @@ extension Array where Element == SemiRing { } extension SemiRing { + /// The plain text description of this instance with score details. var shortDescription: String { "(\(logp), \(logr))" @@ -103,6 +108,7 @@ extension SemiRing { } extension SemiRing { + /// Returns true when `self` is within `tolerance` of `other`. This behavior /// is modeled after SE-0259. /// diff --git a/Support/Text/WordSeg/Alphabet.swift b/Support/Text/WordSeg/Alphabet.swift index 06add8d981f..ac5eb66aee4 100644 --- a/Support/Text/WordSeg/Alphabet.swift +++ b/Support/Text/WordSeg/Alphabet.swift @@ -20,6 +20,7 @@ import TensorFlow /// /// - Note: We map from String in order to support multi-character metadata sequences such as ``. public struct Alphabet { + /// A type whose instances represent a character. public typealias Element = String @@ -28,8 +29,10 @@ public struct Alphabet { /// A marker denoting the end of a sequence. 
public let eos: Int32 + /// A marker denoting the end of a word. public let eow: Int32 + /// A marker used for padding inside a sequence. public let pad: Int32 diff --git a/Support/Text/WordSeg/CharacterSequence.swift b/Support/Text/WordSeg/CharacterSequence.swift index d4c4101afba..5dcc08a0147 100644 --- a/Support/Text/WordSeg/CharacterSequence.swift +++ b/Support/Text/WordSeg/CharacterSequence.swift @@ -16,8 +16,10 @@ import TensorFlow /// A sequence of characters represented by integers. public struct CharacterSequence: Hashable { + /// A collection of integers representing a sequence of characters. public let characters: [Int32] + /// A marker denoting the end of the sequence. private let eos: Int32 @@ -82,8 +84,10 @@ public struct CharacterSequence: Hashable { /// Count of characters in the sequence, including the end marker. public var count: Int { return characters.count } + /// The last character in the sequence, i.e. the end marker. public var last: Int32? { return characters.last } + /// TODO: what's happening here? public var tensor: Tensor { Tensor([self.eos] + characters[0.. Date: Tue, 2 Jun 2020 20:12:48 +0000 Subject: [PATCH 13/30] Clarify more summaries. Remove explicit parameter descriptions and add them to summaries. Conform parameter names to Swift conventions. 
--- Benchmarks/Models/WordSeg.swift | 10 +- Examples/WordSeg/main.swift | 16 +- Models/Text/WordSeg/Lattice.swift | 83 +++-------- Models/Text/WordSeg/Model.swift | 138 ++++++++---------- Models/Text/WordSeg/SemiRing.swift | 21 +-- Support/Text/WordSeg/Alphabet.swift | 16 +- Support/Text/WordSeg/CharacterSequence.swift | 33 ++--- Support/Text/WordSeg/Lexicon.swift | 25 +--- Tests/SupportTests/WordSegSupportTests.swift | 2 +- .../WordSegmentationTests/ProbeLayers.swift | 32 ++-- 10 files changed, 153 insertions(+), 223 deletions(-) diff --git a/Benchmarks/Models/WordSeg.swift b/Benchmarks/Models/WordSeg.swift index 74cf1280601..127fd918372 100644 --- a/Benchmarks/Models/WordSeg.swift +++ b/Benchmarks/Models/WordSeg.swift @@ -106,14 +106,14 @@ struct WordSegBenchmark: Benchmark { from: [sentence], alphabet: dataset.alphabet, maxLength: maximumSequenceLength, - minFreq: 10 + minFrequency: 10 ) let modelParameters = SNLM.Parameters( - ndim: 512, - dropoutProb: 0.5, - chrVocab: dataset.alphabet, - strVocab: lexicon, + hiddenSize: 512, + dropoutProbability: 0.5, + alphabet: dataset.alphabet, + lexicon: lexicon, order: 5 ) diff --git a/Examples/WordSeg/main.swift b/Examples/WordSeg/main.swift index a09db0ce51f..004ba4517c9 100644 --- a/Examples/WordSeg/main.swift +++ b/Examples/WordSeg/main.swift @@ -18,9 +18,9 @@ import TensorFlow import TextModels // Model flags -let ndim = 512 // Hidden unit size. +let hiddenSize = 512 // Hidden unit size. // Training flags -let dropoutProb = 0.5 // Dropout rate. +let dropoutProbability = 0.5 // Dropout rate. let order = 5 // Power of length penalty. let maxEpochs = 1000 // Maximum number of training epochs. var trainingLossHistory = [Float]() // Keep track of loss. @@ -30,7 +30,7 @@ let learningRate: Float = 1e-3 // Initial learning rate. let lambd: Float = 0.00075 // Weight of length penalty. // Lexicon flags. let maxLength = 10 // Maximum length of a string. -let minFreq = 10 // Minimum frequency of a string. 
+let minFrequency = 10 // Minimum frequency of a string. // Load user-provided data files. let dataset: WordSegDataset @@ -55,14 +55,14 @@ let lexicon = Lexicon( from: sequences, alphabet: dataset.alphabet, maxLength: maxLength, - minFreq: minFreq + minFrequency: minFrequency ) let modelParameters = SNLM.Parameters( - ndim: ndim, - dropoutProb: dropoutProb, - chrVocab: dataset.alphabet, - strVocab: lexicon, + hiddenSize: hiddenSize, + dropoutProbability: dropoutProbability, + alphabet: dataset.alphabet, + lexicon: lexicon, order: order ) diff --git a/Models/Text/WordSeg/Lattice.swift b/Models/Text/WordSeg/Lattice.swift index ddd5168883d..ccecfe53dc1 100644 --- a/Models/Text/WordSeg/Lattice.swift +++ b/Models/Text/WordSeg/Lattice.swift @@ -52,17 +52,11 @@ public struct Lattice: Differentiable { /// The expected total score for this segmentation. public var totalScore: SemiRing - /// Creates an edge for `sentence` between `start` and `end` node - /// positions. Sets the log probability to `logp` and uses this value to - /// calculate the score. Sums the score with `previous` to determine the - /// total score. + /// Creates an edge for `sentence` between `start` and `end`. /// - /// - Parameter start: the position of the start node. - /// - Parameter end: the position of the end node. - /// - Parameter sentence: the character sequence. - /// - Parameter logp: the log likelihood. - /// - Parameter previous: the total score of the preceding edge. - /// - Parameter order: the power of the length penalty. + /// Uses the log probability `logp` and the power of the length penalty + /// `order` to calculate the regularization factor and form the current + /// score. Sums this score with `previous` to determine the total score. 
@differentiable init( start: Int, end: Int, sentence: CharacterSequence, logp: Tensor, @@ -81,15 +75,8 @@ public struct Lattice: Differentiable { self.totalScore = self.score * previous } - /// Creates an edge for `string` between `start` and `end` node - /// positions. Sets the log probability, score, and total score. - /// - /// - Parameter start: the position of the start node. - /// - Parameter end: the position of the end node. - /// - Parameter string: the character sequence. - /// - Parameter logp: the log likelihood. - /// - Parameter score: the current score. - /// - Parameter totalScore: the total score. + /// Creates an edge for `string` between `start` and `end` and sets the + /// log probability `logp`, `score`, and `totalScore`. @differentiable public init( start: Int, end: Int, string: CharacterSequence, logp: Tensor, @@ -104,9 +91,10 @@ public struct Lattice: Differentiable { } } - /// Represents a word boundary. When a lattice is built, a start node is - /// created, followed by one for every character in the sequence, - /// representing every potential boundary. + /// Represents a word boundary. + /// + /// When a lattice is built, a start node is created, followed by one for + /// every character in the sequence, representing every potential boundary. /// /// - Note: Scores are only meaningful in relation to incoming edges and the /// start node has no incoming edges. @@ -127,13 +115,8 @@ public struct Lattice: Differentiable { /// Creates an empty instance. init() {} - /// Creates a node preceded by `bestEdge`. Stores `bestScore` and - /// `semiringScore`. Sets incoming edges to `edges`. - /// - /// - Parameter bestEdge: the best incoming edge. - /// - Parameter bestScore: the score of the best incoming edge. - /// - Parameter edges: the incoming edges. - /// - Parameter semiringScore: the composite score of all incoming edges. 
+ /// Creates a node preceded by `bestEdge`, sets incoming edges to + /// `edges`, and stores `bestScore` and `semiringScore`. @differentiable public init( bestEdge: Edge?, bestScore: Float, edges: [Edge], @@ -145,21 +128,21 @@ public struct Lattice: Differentiable { self.semiringScore = semiringScore } - /// Calculates the semiring score by summing the total score of all edges. + /// Returns a sum of the total score of all incoming edges. @differentiable func computeSemiringScore() -> SemiRing { // TODO: Reduceinto and += edges.differentiableMap { $0.totalScore }.sum() } - /// Calculates the current semiring score and sets `semiringScore`. + /// Calculates and sets the current semiring score. @differentiable mutating func recomputeSemiringScore() { semiringScore = computeSemiringScore() } } - /// An ordered collection of nodes. + /// Represents the position of word boundaries. var positions: [Node] /// Accesses the node at the `index`th position. @@ -177,24 +160,16 @@ public struct Lattice: Differentiable { } /// Creates an empty instance with a start node, followed by `count` nodes. - /// - /// - Parameter count: the length of the lattice, e.g. number of characters - /// in the sequence. init(count: Int) { positions = Array(repeating: Node(), count: count + 1) } /// Creates an instance with the nodes in `positions`. - /// - /// - Parameter positions: the nodes composing the lattice. public init(positions: [Node]) { self.positions = positions } - /// Returns a set of edges with the best total score. Traversing this path - /// produces a segmented version of `sentence`. - /// - /// - Parameter sentence: the text to be segmented. + /// Returns the path representing the best segmentation of `sentence`. public mutating func viterbi(sentence: CharacterSequence) -> [Edge] { // Forward pass // Starts at 1 since the 0 node has no incoming edges. 
@@ -224,11 +199,9 @@ public struct Lattice: Differentiable { return bestPath.reversed() } - /// Returns the plain text encoded in `path`, e.g. the segmentation of the - /// full character sequence. + /// Returns the plain text encoded in `path`, using `alphabet`. /// - /// - Parameter path: a lattice path. - /// - Parameter alphabet: the alphabet used in path creation. + /// This represents the segmentation of the full character sequence. public static func pathToPlainText(path: [Edge], alphabet: Alphabet) -> String { var plainText = [String]() for edge in path { @@ -283,11 +256,9 @@ extension Lattice.Edge: CustomStringConvertible { extension Lattice { /// Returns true when all nodes in `self` are within `tolerance` of all - /// nodes in `other`. This behavior is modeled after SE-0259. + /// nodes in `other`. /// - /// - Parameter other: the instance to be compared with `self`. - /// - Parameter tolerance: the amount of variability considered acceptable - /// in determining equality. + /// - Note: This behavior is modeled after SE-0259. public func isAlmostEqual(to other: Self, tolerance: Float) -> Bool { guard self.positions.count == other.positions.count else { print("positions count mismatch: \(self.positions.count) != \(other.positions.count)") @@ -308,12 +279,9 @@ extension Lattice { extension Lattice.Node { /// Returns true when all properties and edges in `self` are within - /// `tolerance` of all properties and edges in `other`. This behavior is - /// modeled after SE-0259. + /// `tolerance` of all properties and edges in `other`. /// - /// - Parameter other: the instance to be compared with `self`. - /// - Parameter tolerance: the amount of variability considered acceptable - /// in determining equality. + /// - Note: This behavior is modeled after SE-0259. 
public func isAlmostEqual(to other: Self, tolerance: Float) -> Bool { guard self.edges.count == other.edges.count else { return false } @@ -338,12 +306,9 @@ extension Lattice.Node { extension Lattice.Edge { /// Returns true when the log likelihood and scores in `self` are within - /// `tolerance` of the log likelihood and scores in `other`. This behavior - /// is modeled after SE-0259. + /// `tolerance` of the log likelihood and scores in `other`. /// - /// - Parameter other: the instance to be compared with `self`. - /// - Parameter tolerance: the amount of variability considered acceptable - /// in determining equality. + /// - Note: This behavior is modeled after SE-0259. public func isAlmostEqual(to other: Self, tolerance: Float) -> Bool { return self.start == other.start && self.end == other.end // TODO: figure out why the string equality is being ignored diff --git a/Models/Text/WordSeg/Model.swift b/Models/Text/WordSeg/Model.swift index b5c341f97f0..03e608be7cf 100644 --- a/Models/Text/WordSeg/Model.swift +++ b/Models/Text/WordSeg/Model.swift @@ -31,34 +31,33 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { public struct Parameters { /// The hidden unit size. - public var ndim: Int + public var hiddenSize: Int /// The dropout rate. - public var dropoutProb: Double + public var dropoutProbability: Double - /// The character vocabulary. - public var chrVocab: Alphabet + /// The union of characters used in this model. + public var alphabet: Alphabet - /// The string vocabulary. - public var strVocab: Lexicon + /// Contiguous sequences of characters encountered in the training data. + public var lexicon: Lexicon /// The power of the length penalty. public var order: Int - /// Creates an instance with `ndim` hidden units, `dropoutProb` dropout - /// rate, `chrVocab` alphabet, `strVocab` lexicon, and `order` power of - /// length penalty. 
+ /// Creates an instance with `hiddenSize` units, `dropoutProbability` + /// rate, `alphabet`, `lexicon`, and `order` power of length penalty. public init( - ndim: Int, - dropoutProb: Double, - chrVocab: Alphabet, - strVocab: Lexicon, + hiddenSize: Int, + dropoutProbability: Double, + alphabet: Alphabet, + lexicon: Lexicon, order: Int ) { - self.ndim = ndim - self.dropoutProb = dropoutProb - self.chrVocab = chrVocab - self.strVocab = strVocab + self.hiddenSize = hiddenSize + self.dropoutProbability = dropoutProbability + self.alphabet = alphabet + self.lexicon = lexicon self.order = order } } @@ -104,55 +103,51 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { // MARK: - Initializer /// Creates an instance with the configuration defined by `parameters`. - /// - /// - Parameter parameters: the model configuration. public init(parameters: Parameters) { self.parameters = parameters // Encoder self.encoderEmbedding = Embedding( - vocabularySize: parameters.chrVocab.count, - embeddingSize: parameters.ndim) + vocabularySize: parameters.alphabet.count, + embeddingSize: parameters.hiddenSize) self.encoderLSTM = LSTM( LSTMCell( - inputSize: parameters.ndim, + inputSize: parameters.hiddenSize, hiddenSize: - parameters.ndim)) + parameters.hiddenSize)) // Interpolation weight self.mlpInterpolation = MLP( - nIn: parameters.ndim, - nHidden: parameters.ndim, - nOut: 2, - dropoutProbability: parameters.dropoutProb) + inputSize: parameters.hiddenSize, + hiddenSize: parameters.hiddenSize, + outputSize: 2, + dropoutProbability: parameters.dropoutProbability) // Lexical memory self.mlpMemory = MLP( - nIn: parameters.ndim, - nHidden: parameters.ndim, - nOut: parameters.strVocab.count, - dropoutProbability: parameters.dropoutProb) + inputSize: parameters.hiddenSize, + hiddenSize: parameters.hiddenSize, + outputSize: parameters.lexicon.count, + dropoutProbability: parameters.dropoutProbability) // Character-level decoder self.decoderEmbedding = Embedding( - 
vocabularySize: parameters.chrVocab.count, - embeddingSize: parameters.ndim) + vocabularySize: parameters.alphabet.count, + embeddingSize: parameters.hiddenSize) self.decoderLSTM = LSTM( LSTMCell( - inputSize: parameters.ndim, + inputSize: parameters.hiddenSize, hiddenSize: - parameters.ndim)) - self.decoderDense = Dense(inputSize: parameters.ndim, outputSize: parameters.chrVocab.count) + parameters.hiddenSize)) + self.decoderDense = Dense(inputSize: parameters.hiddenSize, outputSize: parameters.alphabet.count) // Other layers - self.dropout = Dropout(probability: parameters.dropoutProb) + self.dropout = Dropout(probability: parameters.dropoutProbability) } // MARK: - Encode /// Returns the hidden states of the encoder LSTM applied to `x`. - /// - /// - Parameter x: the character sequence to encode. public func encode(_ x: CharacterSequence) -> [Tensor] { var embedded = encoderEmbedding(x.tensor) embedded = dropout(embedded) @@ -165,10 +160,8 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { // MARK: - Decode - /// Returns the log probabilities for each of the candidates. - /// - /// - Parameter candidates: the character sequences to decode. - /// - Parameter state: the hidden state from the encoder LSTM. + /// Returns the log probabilities for each sequence in `candidates`, given + /// hidden `state` from the encoder LSTM. public func decode(_ candidates: [CharacterSequence], _ state: Tensor) -> Tensor { // TODO(TF-433): Remove closure workaround when autodiff supports non-active rethrowing // functions (`Array.map`). 
@@ -176,16 +169,16 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { var xBatch: [Int32] = [] var yBatch: [Int32] = [] for candidate in candidates { - let padding = Array(repeating: parameters.chrVocab.pad, count: maxLen - candidate.count - 1) + let padding = Array(repeating: parameters.alphabet.pad, count: maxLen - candidate.count - 1) // x is {sentence}{padding} - xBatch.append(parameters.chrVocab.eow) + xBatch.append(parameters.alphabet.eow) xBatch.append(contentsOf: candidate.characters) xBatch.append(contentsOf: padding) // y is {sentence}{padding} yBatch.append(contentsOf: candidate.characters) - yBatch.append(parameters.chrVocab.eow) + yBatch.append(parameters.alphabet.eow) yBatch.append(contentsOf: padding) } @@ -194,26 +187,26 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { let x: Tensor = Tensor(shape: [candidates.count, maxLen], scalars: xBatch).transposed() let y: Tensor = Tensor(shape: [candidates.count, maxLen], scalars: yBatch).transposed() - // [time x batch x ndim] + // [time x batch x hiddenSize] var embeddedX = decoderEmbedding(x) embeddedX = dropout(embeddedX) - // [batch x ndim] + // [batch x hiddenSize] let stateBatch = state.rankLifted().tiled(multiples: Tensor([Int32(candidates.count), 1])) - // [time] array of LSTM states whose `hidden` and `cell` fields have shape [batch x ndim] + // [time] array of LSTM states whose `hidden` and `cell` fields have shape [batch x hiddenSize] let decoderStates = decoderLSTM( embeddedX.unstacked(), initialState: LSTMCell.State( cell: Tensor(zeros: stateBatch.shape), hidden: stateBatch)) - // [time x batch x ndim] + // [time x batch x hiddenSize] var decoderResult = Tensor( stacking: decoderStates.differentiableMap { $0.hidden }) decoderResult = dropout(decoderResult) - // [time x batch x chrVocab.count] + // [time x batch x alphabet.count] let logits = decoderDense(decoderResult) // [time x batch] @@ -226,7 +219,7 @@ public struct SNLM: EuclideanDifferentiable, 
KeyPathIterable { ).reshaped(to: y.shape) // [time x batch] - let logpExcludingPad = logp * Tensor(y .!= parameters.chrVocab.pad) + let logpExcludingPad = logp * Tensor(y .!= parameters.alphabet.pad) // [batch] let candidateLogP = logpExcludingPad.transposed().sum(squeezingAxes: 1) @@ -236,25 +229,16 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { // MARK: - buildLattice - /// Returns the log likelihood for `candidate` from the lexical memory + /// Returns the log probability for `candidate` from the lexical memory /// `logp_lex`. - /// - /// - Parameter logp_lex: all log likelihoods in the lexical memory. - /// - Parameter candidate: the character sequence for which to retrieve the - /// log likelihood. func get_logp_lex(_ logp_lex: Tensor, _ candidate: CharacterSequence) -> Tensor { - guard let index = parameters.strVocab.dictionary[candidate] else { + guard let index = parameters.lexicon.dictionary[candidate] else { return Tensor(-Float.infinity) } return logp_lex[Int(index)] } - /// Returns a complete lattice for `sentence` with a maximum length of - /// `maxLen`. - /// - /// - Parameter sentence: the character sequence used for determining - /// segmentation. - /// - Parameter maxLen: the maximum allowable sequence length. + /// Returns a lattice for `sentence` with `maxLen` maximum sequence length. @differentiable public func buildLattice(_ sentence: CharacterSequence, maxLen: Int) -> Lattice { var lattice = Lattice(count: sentence.count) @@ -267,12 +251,12 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { // TODO: avoid copies? 
let candidate = CharacterSequence( - alphabet: parameters.chrVocab, + alphabet: parameters.alphabet, characters: sentence[pos.."] continue } @@ -282,7 +266,7 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { let current_state = states[pos] let logg = logg_batch[pos].identityADHack // [2] - let logp_lex = logp_lex_batch[pos].identityADHack // [strVocab.chr.count] + let logp_lex = logp_lex_batch[pos].identityADHack // [lexicon.chr.count] let logp_chr = decode(candidates, current_state).identityADHack // [candidates.count] if pos != 0 { // Cleanup: lattice[pos].recomputeSemiringScore() @@ -322,8 +306,9 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { extension Array { - /// Sets the `index`th element of `self` to `value`. Semantically, it - /// behaves like `Array.subscript.set`. + /// Sets the `index`th element of `self` to `value`. + /// + /// Semantically, this function behaves like `Array.subscript.set`. /// /// - Note: this mutating method exists as a workaround for /// `Array.subscript._modify` not being differentiable (TF-1277). @@ -361,17 +346,12 @@ public struct MLP: Layer { /// The second dense layer. public var dense2: Dense - /// Creates an instance with input size `nIn`, `nHidden` hidden units, - /// dropout probability `dropoutProbability` and output size `nOut`. - /// - /// - Parameter nIn: input size. - /// - Parameter nHidden: number of hidden units. - /// - Parameter nOut: output size. - /// - Parameter dropoutProbability: probability that an input is dropped. - public init(nIn: Int, nHidden: Int, nOut: Int, dropoutProbability: Double) { - dense1 = Dense(inputSize: nIn, outputSize: nHidden, activation: tanh) + /// Creates an instance with `inputSize`, `hiddenSize`, + /// `dropoutProbability`, and `outputSize`. 
+ public init(inputSize: Int, hiddenSize: Int, outputSize: Int, dropoutProbability: Double) { + dense1 = Dense(inputSize: inputSize, outputSize: hiddenSize, activation: tanh) dropout = Dropout(probability: dropoutProbability) - dense2 = Dense(inputSize: nHidden, outputSize: nOut, activation: logSoftmax) + dense2 = Dense(inputSize: hiddenSize, outputSize: outputSize, activation: logSoftmax) } /// Returns the result of applying all three layers in sequence to `input`. diff --git a/Models/Text/WordSeg/SemiRing.swift b/Models/Text/WordSeg/SemiRing.swift index 0c7809b10c9..2eaa787eaac 100644 --- a/Models/Text/WordSeg/SemiRing.swift +++ b/Models/Text/WordSeg/SemiRing.swift @@ -23,7 +23,9 @@ import TensorFlow #endif /// Returns a single tensor containing the log of the sum of the exponentials -/// in `x`. Used for numerical stability when dealing with very small values. +/// in `x`. +/// +/// Used for numerical stability when dealing with very small values. @differentiable public func logSumExp(_ x: [Tensor]) -> Tensor { // Deal with an empty array first. @@ -32,8 +34,9 @@ public func logSumExp(_ x: [Tensor]) -> Tensor { } /// Returns a single tensor containing the log of the sum of the exponentials -/// in `lhs` and `rhs`. Used for numerical stability when dealing with very -/// small values. +/// in `lhs` and `rhs`. +/// +/// Used for numerical stability when dealing with very small values. @differentiable public func logSumExp(_ lhs: Tensor, _ rhs: Tensor) -> Tensor { return logSumExp([lhs, rhs]) @@ -71,8 +74,9 @@ public struct SemiRing: Differentiable { static var one: SemiRing { SemiRing(logp: 0.0, logr: -Float.infinity) } } -/// Multiplies `lhs` by `rhs`. Since scores are on a logarithmic scale, -/// products become sums. +/// Multiplies `lhs` by `rhs`. +/// +/// Since scores are on a logarithmic scale, products become sums. 
@differentiable func * (_ lhs: SemiRing, _ rhs: SemiRing) -> SemiRing { return SemiRing( @@ -109,12 +113,9 @@ extension SemiRing { extension SemiRing { - /// Returns true when `self` is within `tolerance` of `other`. This behavior - /// is modeled after SE-0259. + /// Returns true when `self` is within `tolerance` of `other`. /// - /// - Parameter other: the instance to be compared with `self`. - /// - Parameter tolerance: the amount of variability considered acceptable - /// in determining equality. + /// - Note: This behavior is modeled after SE-0259. // TODO(abdulras) see if we can use ulp as a default tolerance @inlinable public func isAlmostEqual(to other: Self, tolerance: Float) -> Bool { diff --git a/Support/Text/WordSeg/Alphabet.swift b/Support/Text/WordSeg/Alphabet.swift index ac5eb66aee4..96a5356914b 100644 --- a/Support/Text/WordSeg/Alphabet.swift +++ b/Support/Text/WordSeg/Alphabet.swift @@ -14,11 +14,13 @@ import TensorFlow -/// A collection that maps individual characters to an integer representation. +/// A mapping between individual characters and their integer representation. /// -/// In Python implementations, this is sometimes called the character vocabulary. +/// - Note: We map from String in order to support multi-character metadata +/// sequences such as ``. /// -/// - Note: We map from String in order to support multi-character metadata sequences such as ``. +/// - Note: In Python implementations, this is sometimes called the character +/// vocabulary. public struct Alphabet { /// A type whose instances represent a character. @@ -37,7 +39,7 @@ public struct Alphabet { public let pad: Int32 /// Creates an instance containing a mapping from `letters` to unique - /// integers, including markers. + /// integers, including markers `eos`, `eow`, and `pad`. 
public init(_ letters: C, eos: String, eow: String, pad: String) where C.Element == Character { self.dictionary = .init(zip(letters.lazy.map { String($0) }, 0...)) @@ -53,7 +55,7 @@ public struct Alphabet { } /// Creates an instance containing a mapping from `letters` to unique - /// integers, including markers. + /// integers, including markers `eos`, `eow`, and `pad`. public init(_ letters: C, eos: String, eow: String, pad: String) where C.Element == Element { self.dictionary = .init(zip(letters.lazy.map { String($0) }, 0...)) @@ -68,10 +70,10 @@ public struct Alphabet { self.dictionary[pad] = self.pad } - /// A count of the characters in the alphabet, including markers. + /// A count of unique characters, including markers. public var count: Int { return dictionary.count } - /// Accesses the `key`th element. + /// Accesses the `key`th element, returning `nil` if it does not exist. public subscript(key: String) -> Int32? { return dictionary[key] } diff --git a/Support/Text/WordSeg/CharacterSequence.swift b/Support/Text/WordSeg/CharacterSequence.swift index 5dcc08a0147..010518df476 100644 --- a/Support/Text/WordSeg/CharacterSequence.swift +++ b/Support/Text/WordSeg/CharacterSequence.swift @@ -17,26 +17,23 @@ import TensorFlow /// A sequence of characters represented by integers. public struct CharacterSequence: Hashable { - /// A collection of integers representing a sequence of characters. + /// Representing an ordered sequence of characters. public let characters: [Int32] /// A marker denoting the end of the sequence. private let eos: Int32 - /// Creates an instance without meaningful contents. + /// Creates an empty instance without meaningful contents. public init(_debug: Int) { self.characters = [] self.eos = -1 } - /// Creates a sequence from `string`, using the integers from `alphabet`, - /// appended with the end of sequence marker. + /// Creates a sequence from `string`, using `alphabet`, appended with the + /// end of sequence marker. 
/// - /// - Parameter alphabet: character to integer mapping. - /// - Parameter appendingEoSTo: string to be converted to a sequence of - /// integers. - /// - /// - Throws: An error of type 'CharacterErrors'. + /// - Throws: `CharacterErrors.unknownCharacter` if `string` contains a + /// character that does not exist in `alphabet`. public init(alphabet: Alphabet, appendingEoSTo string: String) throws { var characters = [Int32]() characters.reserveCapacity(string.count + 1) @@ -51,22 +48,14 @@ public struct CharacterSequence: Hashable { } /// Creates a sequence from `characters` and sets the end of sequence marker - /// from `alphabet`. - /// - /// - Parameter alphabet: character to integer mapping. - /// - Parameter characters: sequence of integers with a terminal end of - /// sequence marker. + /// from `alphabet`. private init(alphabet: Alphabet, characters: [Int32]) { self.characters = characters self.eos = alphabet.eos } - /// Creates a sequenxe from `characters` and sets the end of sequence marker + /// Creates a sequence from `characters` and sets the end of sequence marker /// from `alphabet`. - /// - /// - Parameter alphabet: character to integer mapping. - /// - Parameter characters: sequence of integers with a terminal end of - /// sequence marker. public init(alphabet: Alphabet, characters: ArraySlice) { self.characters = [Int32](characters) self.eos = alphabet.eos @@ -85,7 +74,9 @@ public struct CharacterSequence: Hashable { /// Count of characters in the sequence, including the end marker. public var count: Int { return characters.count } - /// The last character in the sequence, i.e. the end marker. + /// The last character in the sequence, if `characters` is not empty. + /// + /// - Note: This is usually the end marker. public var last: Int32? { return characters.last } /// TODO: what's happening here? 
@@ -96,7 +87,7 @@ public struct CharacterSequence: Hashable { extension CharacterSequence: CustomStringConvertible { - /// A string representation of the collection of integers representing the character sequence. + /// A string representation of the integers in the character sequence. public var description: String { "\(characters)" } diff --git a/Support/Text/WordSeg/Lexicon.swift b/Support/Text/WordSeg/Lexicon.swift index 9ca4f568a2d..428f6dbe9b6 100644 --- a/Support/Text/WordSeg/Lexicon.swift +++ b/Support/Text/WordSeg/Lexicon.swift @@ -14,7 +14,7 @@ import TensorFlow -/// A collection that maps character sequences to logical words. +/// Keeps track of logical words. /// /// In Python implementations, this is sometimes called the string vocabulary /// (in contrast to the character vocabulary or `Alphabet`, which maps @@ -24,35 +24,26 @@ public struct Lexicon { /// A type whose instances represent a sequence of characters. public typealias Element = CharacterSequence - /// A one-to-one mapping between a sequence of characters and unique - /// integers. + /// A one-to-one mapping between logical words and unique integers. // TODO(marcrasi): if the value is not used to construct Tensor, switch to Int public var dictionary: BijectiveDictionary /// A count of unique logical words in the lexicon. public var count: Int { return dictionary.count } - /// Creates an instance containing a mapping from `sequences` to unique - /// integers. - /// - /// - Parameter sequences: character sequences to compose the lexicon. + /// Creates an instance containing `sequences`. public init(_ sequences: C) where C.Element == Element { self.dictionary = .init(zip(sequences, 0...)) } - /// Creates an instance containing a mapping from `sequences` to unique - /// integers, using `alphabet`. Sequences are truncated at `maxLength` and - /// only those occurring `minFreq` times are included. - /// - /// - Parameter sequences: character sequences to compose the lexicon. 
- /// - Parameter alphabet: all characters contained in `sequences`. - /// - Parameter maxLength: sequence length at which truncation occurs. - /// - Parameter minFreq: minimum required occurrence of each sequence. + /// Creates an instance containing `sequences` using `alphabet`, truncating + /// elements at `maxLength` and including only those appearing at least + /// `minFrequency` times. public init( from sequences: [CharacterSequence], alphabet: Alphabet, maxLength: Int, - minFreq: Int + minFrequency: Int ) { var histogram: [ArraySlice: Int] = [:] @@ -69,7 +60,7 @@ public struct Lexicon { } } - let frequentWordCandidates = histogram.filter { $0.1 >= minFreq } + let frequentWordCandidates = histogram.filter { $0.1 >= minFrequency } let vocab = frequentWordCandidates.map { CharacterSequence(alphabet: alphabet, characters: $0.0) } diff --git a/Tests/SupportTests/WordSegSupportTests.swift b/Tests/SupportTests/WordSegSupportTests.swift index 78406e6a6f2..c97feda32da 100644 --- a/Tests/SupportTests/WordSegSupportTests.swift +++ b/Tests/SupportTests/WordSegSupportTests.swift @@ -72,7 +72,7 @@ class WordSegSupportTests: XCTestCase { try! CharacterSequence(alphabet: alphabet, appendingEoSTo: "alpha"), try! CharacterSequence(alphabet: alphabet, appendingEoSTo: "beta"), try! 
CharacterSequence(alphabet: alphabet, appendingEoSTo: "gamma"), - ], alphabet: alphabet, maxLength: 5, minFreq: 4) + ], alphabet: alphabet, maxLength: 5, minFrequency: 4) XCTAssertEqual(lexicon.count, 3) } diff --git a/Tests/TextTests/WordSegmentationTests/ProbeLayers.swift b/Tests/TextTests/WordSegmentationTests/ProbeLayers.swift index a9337bdb10c..815d8bd0c97 100644 --- a/Tests/TextTests/WordSegmentationTests/ProbeLayers.swift +++ b/Tests/TextTests/WordSegmentationTests/ProbeLayers.swift @@ -123,42 +123,42 @@ func almostEqual( class WordSegProbeLayerTests: XCTestCase { func testProbeEncoder() { - // chrVocab is: + // alphabet is: // 0 - a // 1 - b // 2 - // 3 - // 4 - - let chrVocab: Alphabet = Alphabet( + let alphabet: Alphabet = Alphabet( [ "a", "b", ], eos: "", eow: "", pad: "") - // strVocab is: + // lexicon is: // 0 - aaaa // 1 - bbbb // 2 - abab - let strVocab: Lexicon = Lexicon([ - CharacterSequence(alphabet: chrVocab, characters: [0, 0]), // "aa" - CharacterSequence(alphabet: chrVocab, characters: [1, 1]), // "bb" - CharacterSequence(alphabet: chrVocab, characters: [0, 1]), // "ab" - CharacterSequence(alphabet: chrVocab, characters: [1, 0]), // "ba" + let lexicon: Lexicon = Lexicon([ + CharacterSequence(alphabet: alphabet, characters: [0, 0]), // "aa" + CharacterSequence(alphabet: alphabet, characters: [1, 1]), // "bb" + CharacterSequence(alphabet: alphabet, characters: [0, 1]), // "ab" + CharacterSequence(alphabet: alphabet, characters: [1, 0]), // "ba" ]) var model = SNLM( parameters: SNLM.Parameters( - ndim: 2, - dropoutProb: 0, - chrVocab: chrVocab, - strVocab: strVocab, + hiddenSize: 2, + dropoutProbability: 0, + alphabet: alphabet, + lexicon: lexicon, order: 5)) model.setParameters(Example1.parameters) print("Encoding") let encoderStates = model.encode( - CharacterSequence(alphabet: chrVocab, characters: [0, 1, 0, 1])) // "abab" + CharacterSequence(alphabet: alphabet, characters: [0, 1, 0, 1])) // "abab" let encoderStatesTensor = Tensor(stacking: 
encoderStates) print("Expected: \(Example1.expectedEncoding)") print("Actual: \(encoderStatesTensor)") @@ -184,8 +184,8 @@ class WordSegProbeLayerTests: XCTestCase { print("Decode") let decoded = model.decode( [ - CharacterSequence(alphabet: chrVocab, characters: [0, 0, 0]), // "aaa" - CharacterSequence(alphabet: chrVocab, characters: [0, 1]), // "ab" + CharacterSequence(alphabet: alphabet, characters: [0, 0, 0]), // "aaa" + CharacterSequence(alphabet: alphabet, characters: [0, 1]), // "ab" ], encoderStates[0] ) @@ -195,7 +195,7 @@ class WordSegProbeLayerTests: XCTestCase { print("OK!\n") print("Build Lattice") - let abab = CharacterSequence(alphabet: chrVocab, characters: [0, 1, 0, 1]) + let abab = CharacterSequence(alphabet: alphabet, characters: [0, 1, 0, 1]) let lattice = model.buildLattice(abab, maxLen: 5) XCTAssert(lattice.isAlmostEqual(to: Example1.lattice, tolerance: 1e-5)) From 31fc09b3c9eda1513cd38c5ff9a989376014992c Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Tue, 2 Jun 2020 16:26:37 -0400 Subject: [PATCH 14/30] Lint --- Models/Text/WordSeg/Model.swift | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Models/Text/WordSeg/Model.swift b/Models/Text/WordSeg/Model.swift index 03e608be7cf..39af6fb43d7 100644 --- a/Models/Text/WordSeg/Model.swift +++ b/Models/Text/WordSeg/Model.swift @@ -139,7 +139,8 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable { inputSize: parameters.hiddenSize, hiddenSize: parameters.hiddenSize)) - self.decoderDense = Dense(inputSize: parameters.hiddenSize, outputSize: parameters.alphabet.count) + self.decoderDense = Dense( + inputSize: parameters.hiddenSize, outputSize: parameters.alphabet.count) // Other layers self.dropout = Dropout(probability: parameters.dropoutProbability) From e80dfc2fd9ad0465f54de12c63f263ba5b3c34d2 Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Tue, 2 Jun 2020 20:34:30 +0000 Subject: [PATCH 15/30] Clarify lattice summary. 
--- Models/Text/WordSeg/Lattice.swift | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Models/Text/WordSeg/Lattice.swift b/Models/Text/WordSeg/Lattice.swift index ccecfe53dc1..5ede8495508 100644 --- a/Models/Text/WordSeg/Lattice.swift +++ b/Models/Text/WordSeg/Lattice.swift @@ -24,8 +24,9 @@ import TensorFlow #endif /// A structure used for scoring all possible segmentations of a character -/// sequence. The path with the best score provides the most likely -/// segmentation at inference. +/// sequence. +/// +/// The path with the best score provides the most likely segmentation. public struct Lattice: Differentiable { /// Represents a word. From 09e82b5e18f1c0a3e4391246721770fb501e1f7c Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Tue, 2 Jun 2020 21:10:32 +0000 Subject: [PATCH 16/30] Summary refinement --- Support/Text/WordSeg/CharacterSequence.swift | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Support/Text/WordSeg/CharacterSequence.swift b/Support/Text/WordSeg/CharacterSequence.swift index 010518df476..3f35773686e 100644 --- a/Support/Text/WordSeg/CharacterSequence.swift +++ b/Support/Text/WordSeg/CharacterSequence.swift @@ -17,7 +17,7 @@ import TensorFlow /// A sequence of characters represented by integers. public struct CharacterSequence: Hashable { - /// Representing an ordered sequence of characters. + /// Represents an ordered sequence of characters. public let characters: [Int32] /// A marker denoting the end of the sequence. @@ -30,7 +30,7 @@ public struct CharacterSequence: Hashable { } /// Creates a sequence from `string`, using `alphabet`, appended with the - /// end of sequence marker. + /// end marker. /// /// - Throws: `CharacterErrors.unknownCharacter` if `string` contains a /// character that does not exist in `alphabet`. 
@@ -47,15 +47,15 @@ public struct CharacterSequence: Hashable { self.init(alphabet: alphabet, characters: characters) } - /// Creates a sequence from `characters` and sets the end of sequence marker - /// from `alphabet`. + /// Creates a sequence from `characters` and sets the end marker from + /// `alphabet`. private init(alphabet: Alphabet, characters: [Int32]) { self.characters = characters self.eos = alphabet.eos } - /// Creates a sequence from `characters` and sets the end of sequence marker - /// from `alphabet`. + /// Creates a sequence from `characters` and sets the end marker from + /// `alphabet`. public init(alphabet: Alphabet, characters: ArraySlice) { self.characters = [Int32](characters) self.eos = alphabet.eos From d94a45e6b82a1e28c436e9d546640ab6aaca6bf2 Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Fri, 5 Jun 2020 01:37:04 +0000 Subject: [PATCH 17/30] Clarify end marker behavior and assumptions --- Support/Text/WordSeg/CharacterSequence.swift | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Support/Text/WordSeg/CharacterSequence.swift b/Support/Text/WordSeg/CharacterSequence.swift index 3f35773686e..30def797169 100644 --- a/Support/Text/WordSeg/CharacterSequence.swift +++ b/Support/Text/WordSeg/CharacterSequence.swift @@ -49,6 +49,8 @@ public struct CharacterSequence: Hashable { /// Creates a sequence from `characters` and sets the end marker from /// `alphabet`. + /// + /// - Note: Assumes `characters` contains an end marker. private init(alphabet: Alphabet, characters: [Int32]) { self.characters = characters self.eos = alphabet.eos @@ -56,6 +58,8 @@ public struct CharacterSequence: Hashable { /// Creates a sequence from `characters` and sets the end marker from /// `alphabet`. + /// + /// - Note: Assumes `characters` contains an end marker. 
public init(alphabet: Alphabet, characters: ArraySlice) { self.characters = [Int32](characters) self.eos = alphabet.eos @@ -79,7 +83,8 @@ public struct CharacterSequence: Hashable { /// - Note: This is usually the end marker. public var last: Int32? { return characters.last } - /// TODO: what's happening here? + /// Representation for character generation, with the end marker moved to + /// the beginning. public var tensor: Tensor { Tensor([self.eos] + characters[0.. Date: Tue, 9 Jun 2020 15:50:18 +0000 Subject: [PATCH 18/30] Rename ReferenceArchive to DownloadableArchive Change members to lets --- Datasets/WordSeg/WordSegDataset.swift | 32 +++++++++++++-------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift index f45a3374007..5c108134ceb 100644 --- a/Datasets/WordSeg/WordSegDataset.swift +++ b/Datasets/WordSeg/WordSegDataset.swift @@ -36,19 +36,19 @@ public struct WordSegDataset { public let alphabet: Alphabet /// A pointer to source data. - private struct ReferenceArchive { + private struct DownloadableArchive { /// The location of the archive. - var location = URL(string: "https://s3.eu-west-2.amazonaws.com/k-kawakami/seg.zip")! + let location = URL(string: "https://s3.eu-west-2.amazonaws.com/k-kawakami/seg.zip")! /// The path to the test source. - var testingFilePath = "br/br-text/te.txt" + let testingFilePath = "br/br-text/te.txt" /// The path to the training source. - var trainingFilePath = "br/br-text/tr.txt" + let trainingFilePath = "br/br-text/tr.txt" /// The path to the validation source. - var validationFilePath = "br/br-text/va.txt" + let validationFilePath = "br/br-text/va.txt" } /// Returns the text of all phrases parsed from `data` in UTF8. @@ -131,28 +131,28 @@ public struct WordSegDataset { /// - Throws: an error in the Cocoa domain, if the default training file /// cannot be read. 
public init() throws { - let referenceArchive = ReferenceArchive() + let downloadableArchive = DownloadableArchive() let localStorageDirectory: URL = DatasetUtilities.defaultDirectory .appendingPathComponent("WordSeg", isDirectory: true) WordSegDataset.downloadIfNotPresent( - to: localStorageDirectory, referenceArchive: referenceArchive) + to: localStorageDirectory, downloadableArchive: downloadableArchive) let archiveFileName = - referenceArchive + downloadableArchive .location.deletingPathExtension().lastPathComponent let archiveDirectory = localStorageDirectory .appendingPathComponent(archiveFileName) let trainingFilePath = archiveDirectory - .appendingPathComponent(referenceArchive.trainingFilePath).path + .appendingPathComponent(downloadableArchive.trainingFilePath).path let validationFilePath = archiveDirectory - .appendingPathComponent(referenceArchive.validationFilePath).path + .appendingPathComponent(downloadableArchive.validationFilePath).path let testingFilePath = archiveDirectory - .appendingPathComponent(referenceArchive.testingFilePath).path + .appendingPathComponent(downloadableArchive.testingFilePath).path try self.init( training: trainingFilePath, validation: validationFilePath, @@ -226,10 +226,10 @@ public struct WordSegDataset { self.testingPhrases = Self.convertDataset(testing, alphabet: self.alphabet) } - /// Downloads and unpacks `referenceArchive` to `directory` if it does not + /// Downloads and unpacks `downloadableArchive` to `directory` if it does not /// exist locally. 
private static func downloadIfNotPresent( - to directory: URL, referenceArchive: ReferenceArchive + to directory: URL, downloadableArchive: DownloadableArchive ) { let downloadPath = directory.path let directoryExists = FileManager.default.fileExists(atPath: downloadPath) @@ -238,9 +238,9 @@ public struct WordSegDataset { guard !directoryExists || directoryEmpty else { return } - let remoteRoot = referenceArchive.location.deletingLastPathComponent() - let filename = referenceArchive.location.deletingPathExtension().lastPathComponent - let fileExtension = referenceArchive.location.pathExtension + let remoteRoot = downloadableArchive.location.deletingLastPathComponent() + let filename = downloadableArchive.location.deletingPathExtension().lastPathComponent + let fileExtension = downloadableArchive.location.pathExtension // Downloads and extracts dataset files. let _ = DatasetUtilities.downloadResource( From c9ffce653ac1571bfdb073a27c0bcb63114bb93f Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Tue, 9 Jun 2020 08:52:05 -0700 Subject: [PATCH 19/30] Update Datasets/WordSeg/WordSegDataset.swift Co-authored-by: Dave Abrahams --- Datasets/WordSeg/WordSegDataset.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift index 5c108134ceb..29bdfb6d446 100644 --- a/Datasets/WordSeg/WordSegDataset.swift +++ b/Datasets/WordSeg/WordSegDataset.swift @@ -41,7 +41,7 @@ public struct WordSegDataset { /// The location of the archive. let location = URL(string: "https://s3.eu-west-2.amazonaws.com/k-kawakami/seg.zip")! - /// The path to the test source. + /// The path to the test source within the unpacked archive. let testingFilePath = "br/br-text/te.txt" /// The path to the training source. 
From 80e575a72a1d464bbc28b4c157d43aa3b674f805 Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Tue, 9 Jun 2020 08:52:17 -0700 Subject: [PATCH 20/30] Update Datasets/WordSeg/WordSegDataset.swift Co-authored-by: Dave Abrahams --- Datasets/WordSeg/WordSegDataset.swift | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift index 29bdfb6d446..43456160f84 100644 --- a/Datasets/WordSeg/WordSegDataset.swift +++ b/Datasets/WordSeg/WordSegDataset.swift @@ -38,7 +38,8 @@ public struct WordSegDataset { /// A pointer to source data. private struct DownloadableArchive { - /// The location of the archive. + /// A [web resource](https://en.wikipedia.org/wiki/Web_resource) that can be unpacked + /// into data files described by other properties of `self`. let location = URL(string: "https://s3.eu-west-2.amazonaws.com/k-kawakami/seg.zip")! /// The path to the test source within the unpacked archive. From fb5e5d73c3b89d9493486d300beddd9c1740d2e4 Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Tue, 9 Jun 2020 16:38:42 +0000 Subject: [PATCH 21/30] Remove implied text from comments with phrase. --- Datasets/WordSeg/WordSegDataset.swift | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift index 43456160f84..f2ddc367f2b 100644 --- a/Datasets/WordSeg/WordSegDataset.swift +++ b/Datasets/WordSeg/WordSegDataset.swift @@ -23,13 +23,13 @@ import ModelSupport /// https://www.aclweb.org/anthology/P19-1645.pdf. public struct WordSegDataset { - /// The text used for training. + /// The training data. public let trainingPhrases: [Phrase] - /// The text used for testing. + /// The test data. public private(set) var testingPhrases: [Phrase]? - /// The text used for validation. + /// The validation data. public private(set) var validationPhrases: [Phrase]? 
/// The union of all characters in the included dataset. @@ -42,17 +42,17 @@ public struct WordSegDataset { /// into data files described by other properties of `self`. let location = URL(string: "https://s3.eu-west-2.amazonaws.com/k-kawakami/seg.zip")! - /// The path to the test source within the unpacked archive. + /// The path to the test data within the unpacked archive. let testingFilePath = "br/br-text/te.txt" - /// The path to the training source. + /// The path to the training data within the unpacked archive. let trainingFilePath = "br/br-text/tr.txt" - /// The path to the validation source. + /// The path to the validation data within the unpacked archive. let validationFilePath = "br/br-text/va.txt" } - /// Returns the text of all phrases parsed from `data` in UTF8. + /// Returns all phrases parsed from `data` in UTF8. private static func load(data: Data) -> [String] { guard let contents: String = String(data: data, encoding: .utf8) else { return [] @@ -60,7 +60,7 @@ public struct WordSegDataset { return load(contents: contents) } - /// Returns the text of all phrases from `contents`. + /// Returns all phrases from `contents`. private static func load(contents: String) -> [String] { var strings = [String]() @@ -127,7 +127,7 @@ public struct WordSegDataset { return phrases } - /// Creates an instance containing phrases from the default location. + /// Creates an instance containing phrases from the reference archive. /// /// - Throws: an error in the Cocoa domain, if the default training file /// cannot be read. 
From 92b58e943d3be86db528b2e5d71f5036e1466e4c Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Tue, 9 Jun 2020 18:59:15 +0000 Subject: [PATCH 22/30] Remove Foundation string processing Remove unnecessary additional `load()` Remove unnecessary optional from `testingPhrases` and `validationPhrases` Simplify optional filename handling in init() Remove extra `)` from training loss output Add test for loading only training file --- Datasets/WordSeg/WordSegDataset.swift | 54 ++++++------------- Examples/WordSeg/main.swift | 8 +-- .../WordSeg/WordSegDatasetTests.swift | 31 ++++++++--- 3 files changed, 44 insertions(+), 49 deletions(-) diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift index f2ddc367f2b..979c5d64de8 100644 --- a/Datasets/WordSeg/WordSegDataset.swift +++ b/Datasets/WordSeg/WordSegDataset.swift @@ -27,10 +27,10 @@ public struct WordSegDataset { public let trainingPhrases: [Phrase] /// The test data. - public private(set) var testingPhrases: [Phrase]? + public private(set) var testingPhrases: [Phrase] /// The validation data. - public private(set) var validationPhrases: [Phrase]? + public private(set) var validationPhrases: [Phrase] /// The union of all characters in the included dataset. public let alphabet: Alphabet @@ -52,24 +52,11 @@ public struct WordSegDataset { let validationFilePath = "br/br-text/va.txt" } - /// Returns all phrases parsed from `data` in UTF8. + /// Returns phrases parsed from `data` in UTF8, separated by newlines. private static func load(data: Data) -> [String] { - guard let contents: String = String(data: data, encoding: .utf8) else { - return [] - } - return load(contents: contents) - } - - /// Returns all phrases from `contents`. 
- private static func load(contents: String) -> [String] { - var strings = [String]() - - for line in contents.components(separatedBy: .newlines) { - let trimmed = line.trimmingCharacters(in: .whitespaces) - if trimmed.isEmpty { continue } - strings.append(trimmed) - } - return strings + let contents = String(decoding: data, as: Unicode.UTF8.self) + let splitContents = contents.split(separator: "\n", omittingEmptySubsequences: true) + return splitContents.map { String($0) } } /// Returns the union of all characters in `training` and `otherSequences`. @@ -113,7 +100,7 @@ public struct WordSegDataset { var phrases = [Phrase]() for data in dataset { - let trimmed = data.components(separatedBy: .whitespaces).joined() + let trimmed = data.split(separator: " ", omittingEmptySubsequences: true).joined() guard let numericalizedText = try? CharacterSequence( alphabet: alphabet, appendingEoSTo: trimmed) @@ -175,26 +162,15 @@ public struct WordSegDataset { options: .alwaysMapped) let training = Self.load(data: trainingData) - let validation: [String] - let testing: [String] - - if let validationFile = validationFile { - let data = try Data( - contentsOf: URL(fileURLWithPath: validationFile), - options: .alwaysMapped) - validation = Self.load(data: data) - } else { - validation = [String]() - } + let validationData = try Data( + contentsOf: URL(fileURLWithPath: validationFile ?? "/dev/null"), + options: .alwaysMapped) + let validation = Self.load(data: validationData) - if let testingFile = testingFile { - let data: Data = try Data( - contentsOf: URL(fileURLWithPath: testingFile), - options: .alwaysMapped) - testing = Self.load(data: data) - } else { - testing = [String]() - } + let testingData = try Data( + contentsOf: URL(fileURLWithPath: testingFile ?? 
"/dev/null"), + options: .alwaysMapped) + let testing = Self.load(data: testingData) self.alphabet = Self.makeAlphabet(datasets: training, validation, testing) self.trainingPhrases = Self.convertDataset(training, alphabet: self.alphabet) diff --git a/Examples/WordSeg/main.swift b/Examples/WordSeg/main.swift index c2b850c0589..10909cf744f 100644 --- a/Examples/WordSeg/main.swift +++ b/Examples/WordSeg/main.swift @@ -103,11 +103,11 @@ for epoch in 1...maxEpochs { trainingLossHistory.append(trainingLoss) reduceLROnPlateau(lossHistory: trainingLossHistory, optimizer: optimizer) - guard let validationPhrases = dataset.validationPhrases else { + if dataset.validationPhrases.count < 1 { print( """ [Epoch \(epoch)] \ - Training loss: \(trainingLoss)) + Training loss: \(trainingLoss) """ ) @@ -127,7 +127,7 @@ for epoch in 1...maxEpochs { var validationBatchCount = 0 var validationCharacterCount = 0 var validationPlainText: String = "" - for phrase in validationPhrases { + for phrase in dataset.validationPhrases { let sentence = phrase.numericalizedText var lattice = model.buildLattice(sentence, maxLen: maxLength) let score = lattice[sentence.count].semiringScore @@ -137,7 +137,7 @@ for epoch in 1...maxEpochs { validationCharacterCount += sentence.count // View a sample segmentation once per epoch. 
- if validationBatchCount == validationPhrases.count { + if validationBatchCount == dataset.validationPhrases.count { let bestPath = lattice.viterbi(sentence: phrase.numericalizedText) validationPlainText = Lattice.pathToPlainText(path: bestPath, alphabet: dataset.alphabet) } diff --git a/Tests/DatasetsTests/WordSeg/WordSegDatasetTests.swift b/Tests/DatasetsTests/WordSeg/WordSegDatasetTests.swift index 06f073dd542..15cd2e6c75d 100644 --- a/Tests/DatasetsTests/WordSeg/WordSegDatasetTests.swift +++ b/Tests/DatasetsTests/WordSeg/WordSegDatasetTests.swift @@ -17,12 +17,12 @@ import ModelSupport import XCTest class WordSegDatasetTests: XCTestCase { - func testCreateWordSegDataset() { + func testCreateWordSegDatasetReference() { do { let dataset = try WordSegDataset() XCTAssertEqual(dataset.trainingPhrases.count, 7832) - XCTAssertEqual(dataset.validationPhrases!.count, 979) - XCTAssertEqual(dataset.testingPhrases!.count, 979) + XCTAssertEqual(dataset.validationPhrases.count, 979) + XCTAssertEqual(dataset.testingPhrases.count, 979) // Check the first example in each set. 
let trainingExample: [Int32] = [ @@ -32,14 +32,32 @@ class WordSegDatasetTests: XCTestCase { XCTAssertEqual(dataset.trainingPhrases[0].numericalizedText.characters, trainingExample) let validationExample: [Int32] = [9, 6, 13, 13, 16, 14, 10, 14, 10, 28] - XCTAssertEqual(dataset.validationPhrases![0].numericalizedText.characters, validationExample) + XCTAssertEqual(dataset.validationPhrases[0].numericalizedText.characters, validationExample) let testingExample: [Int32] = [ 13, 6, 21, 14, 6, 20, 6, 6, 10, 7, 10, 4, 2, 15, 20, 6, 6, 2, 15, 26, 3, 16, 5, 26, 10, 15, 21, 9, 2, 21, 14, 10, 19, 19, 16, 19, 28, ] - XCTAssertEqual(dataset.testingPhrases![0].numericalizedText.characters, testingExample) + XCTAssertEqual(dataset.testingPhrases[0].numericalizedText.characters, testingExample) + } catch { + XCTFail(error.localizedDescription) + } + } + + func testCreateWordSegDatasetTrainingOnly() { + do { + let dataset = try WordSegDataset(training: "/home/michellecasbon/tmp/seg/br/br-text/tr.txt") + XCTAssertEqual(dataset.trainingPhrases.count, 7832) + XCTAssertEqual(dataset.validationPhrases.count, 0) + XCTAssertEqual(dataset.testingPhrases.count, 0) + + // Check the first example in each set. 
+ let trainingExample: [Int32] = [ + 26, 16, 22, 24, 2, 15, 21, 21, 16, 20, 6, 6, 21, + 9, 6, 3, 16, 16, 12, 28, + ] + XCTAssertEqual(dataset.trainingPhrases[0].numericalizedText.characters, trainingExample) } catch { XCTFail(error.localizedDescription) } @@ -66,7 +84,8 @@ class WordSegDatasetTests: XCTestCase { } static var allTests = [ - ("testCreateWordSegDataset", testCreateWordSegDataset), + ("testCreateWordSegDatasetReference", testCreateWordSegDatasetReference), + ("testCreateWordSegDatasetTrainingOnly", testCreateWordSegDatasetTrainingOnly), ("testWordSegDatasetLoad", testWordSegDatasetLoad), ] } From 21fc998097ab3299a42845c8269ad4e7148a3001 Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Tue, 9 Jun 2020 12:07:45 -0700 Subject: [PATCH 23/30] Update Datasets/WordSeg/WordSegDataset.swift Co-authored-by: Dave Abrahams --- Datasets/WordSeg/WordSegDataset.swift | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift index 979c5d64de8..c17074d127d 100644 --- a/Datasets/WordSeg/WordSegDataset.swift +++ b/Datasets/WordSeg/WordSegDataset.swift @@ -61,9 +61,9 @@ public struct WordSegDataset { /// Returns the union of all characters in `training` and `otherSequences`. /// - /// - Parameter eos: text to be used as the end of sequence marker. - /// - Parameter eow: text to be used as the end of word marker. - /// - Parameter pad: text to be used as the padding marker. + /// - Parameter eos: the end of sequence marker. + /// - Parameter eow:the end of word marker. + /// - Parameter pad: the padding marker. 
  private static func makeAlphabet(
    datasets training: [String],
    _ otherSequences: [String]?...,

From cc3f30ea7755ec3697be928c3e5e781c8d79aeaf Mon Sep 17 00:00:00 2001
From: Michelle Casbon
Date: Wed, 10 Jun 2020 16:50:01 +0000
Subject: [PATCH 24/30] Remove variadic arguments in makeAlphabet

Simplify and remove redundant init code
---
 Datasets/WordSeg/WordSegDataset.swift | 39 +++++++--------------------
 1 file changed, 10 insertions(+), 29 deletions(-)

diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift
index c17074d127d..1f402aa00a9 100644
--- a/Datasets/WordSeg/WordSegDataset.swift
+++ b/Datasets/WordSeg/WordSegDataset.swift
@@ -59,26 +59,22 @@ public struct WordSegDataset {
     return splitContents.map { String($0) }
   }
 
-  /// Returns the union of all characters in `training` and `otherSequences`.
+  /// Returns the union of all characters in `phrases`.
   ///
   /// - Parameter eos: the end of sequence marker.
   /// - Parameter eow: the end of word marker.
   /// - Parameter pad: the padding marker.
   private static func makeAlphabet(
-    datasets training: [String],
-    _ otherSequences: [String]?...,
+    phrases: [String],
     eos: String = "</s>",
     eow: String = "</w>",
     pad: String = "</pad>"
   ) -> Alphabet {
    var letters: Set<Character> = []
 
-    for dataset in otherSequences + [training] {
-      guard let dataset = dataset else { continue }
-      for sentence in dataset {
-        for character in sentence {
-          if !character.isWhitespace { letters.insert(character) }
-        }
+    for phrase in phrases {
+      for character in phrase {
+        if !character.isWhitespace { letters.insert(character) }
       }
     }
 
@@ -160,22 +156,17 @@ public struct WordSegDataset {
     let trainingData = try Data(
       contentsOf: URL(fileURLWithPath: trainingFile), options: .alwaysMapped)
-    let training = Self.load(data: trainingData)
 
     let validationData = try Data(
       contentsOf: URL(fileURLWithPath: validationFile ??
"/dev/null"), options: .alwaysMapped) - let validation = Self.load(data: validationData) let testingData = try Data( contentsOf: URL(fileURLWithPath: testingFile ?? "/dev/null"), options: .alwaysMapped) - let testing = Self.load(data: testingData) - self.alphabet = Self.makeAlphabet(datasets: training, validation, testing) - self.trainingPhrases = Self.convertDataset(training, alphabet: self.alphabet) - self.validationPhrases = Self.convertDataset(validation, alphabet: self.alphabet) - self.testingPhrases = Self.convertDataset(testing, alphabet: self.alphabet) + self.init( + training: trainingData, validation: validationData, testing: testingData) } /// Creates an instance containing phrases from `trainingData`, and @@ -184,20 +175,10 @@ public struct WordSegDataset { training trainingData: Data, validation validationData: Data?, testing testingData: Data? ) { let training = Self.load(data: trainingData) - let validation: [String] - let testing: [String] - if let validationData = validationData { - validation = Self.load(data: validationData) - } else { - validation = [String]() - } - if let testingData = testingData { - testing = Self.load(data: testingData) - } else { - testing = [String]() - } + let validation = Self.load(data: validationData ?? Data()) + let testing = Self.load(data: testingData ?? 
Data()) - self.alphabet = Self.makeAlphabet(datasets: training, validation, testing) + self.alphabet = Self.makeAlphabet(phrases: training + validation + testing) self.trainingPhrases = Self.convertDataset(training, alphabet: self.alphabet) self.validationPhrases = Self.convertDataset(validation, alphabet: self.alphabet) self.testingPhrases = Self.convertDataset(testing, alphabet: self.alphabet) From 361e3805a2745887f24df77d775ad3793bb4520b Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Thu, 11 Jun 2020 15:47:44 +0000 Subject: [PATCH 25/30] Rename convertDataset to numericalizeDataset --- Datasets/WordSeg/WordSegDataset.swift | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift index 1f402aa00a9..686e61db7a1 100644 --- a/Datasets/WordSeg/WordSegDataset.swift +++ b/Datasets/WordSeg/WordSegDataset.swift @@ -85,12 +85,11 @@ public struct WordSegDataset { return Alphabet(sorted, eos: eos, eow: eow, pad: pad) } - /// Returns phrases from `dataset`, using `alphabet`, to be used with the + /// Numericalizes `dataset` with the mapping in `alphabet`, to be used with the /// WordSeg model. /// - /// - Note: Omits any part of the dataset that cannot be converted to - /// `CharacterSequence`. - private static func convertDataset(_ dataset: [String], alphabet: Alphabet) + /// - Note: Omits any phrase that cannot be converted to `CharacterSequence`. + private static func numericalizeDataset(_ dataset: [String], alphabet: Alphabet) -> [Phrase] { var phrases = [Phrase]() @@ -179,9 +178,9 @@ public struct WordSegDataset { let testing = Self.load(data: testingData ?? 
Data())
 
     self.alphabet = Self.makeAlphabet(phrases: training + validation + testing)
-    self.trainingPhrases = Self.convertDataset(training, alphabet: self.alphabet)
-    self.validationPhrases = Self.convertDataset(validation, alphabet: self.alphabet)
-    self.testingPhrases = Self.convertDataset(testing, alphabet: self.alphabet)
+    self.trainingPhrases = Self.numericalizeDataset(training, alphabet: self.alphabet)
+    self.validationPhrases = Self.numericalizeDataset(validation, alphabet: self.alphabet)
+    self.testingPhrases = Self.numericalizeDataset(testing, alphabet: self.alphabet)
   }

From 6e0ae1543f9bce9e5afad232567d6417832fd626 Mon Sep 17 00:00:00 2001
From: Michelle Casbon
Date: Thu, 11 Jun 2020 17:30:25 +0000
Subject: [PATCH 26/30] Remove raw loop in makeAlphabet

Rename downloadableArchive to source
Preserve intermediate array type
---
 Datasets/WordSeg/WordSegDataset.swift | 45 +++++++++++----------------
 1 file changed, 18 insertions(+), 27 deletions(-)

diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift
index 686e61db7a1..1795f5262c1 100644
--- a/Datasets/WordSeg/WordSegDataset.swift
+++ b/Datasets/WordSeg/WordSegDataset.swift
@@ -53,10 +53,10 @@ public struct WordSegDataset {
   }
 
   /// Returns phrases parsed from `data` in UTF8, separated by newlines.
-  private static func load(data: Data) -> [String] {
+  private static func load(data: Data) -> [Substring] {
     let contents = String(decoding: data, as: Unicode.UTF8.self)
     let splitContents = contents.split(separator: "\n", omittingEmptySubsequences: true)
-    return splitContents.map { String($0) }
+    return splitContents
   }
 
   /// Returns the union of all characters in `phrases`.
@@ -65,22 +65,15 @@
   /// - Parameter eos: the end of sequence marker.
   /// - Parameter eow: the end of word marker.
   /// - Parameter pad: the padding marker.
  private static func makeAlphabet(
-    phrases: [String],
+    phrases: [Substring],
     eos: String = "</s>",
     eow: String = "</w>",
     pad: String = "</pad>"
   ) -> Alphabet {
-    var letters: Set<Character> = []
-
-    for phrase in phrases {
-      for character in phrase {
-        if !character.isWhitespace { letters.insert(character) }
-      }
-    }
+    let letters = Set(phrases.joined().lazy.filter { !$0.isWhitespace })
 
     // Sort the letters to make it easier to interpret ints vs letters.
-    var sorted = Array(letters)
-    sorted.sort()
+    let sorted = Array(letters).sorted()
 
     return Alphabet(sorted, eos: eos, eow: eow, pad: pad)
   }
 
@@ -89,7 +82,7 @@ public struct WordSegDataset {
   /// WordSeg model.
   ///
   /// - Note: Omits any phrase that cannot be converted to `CharacterSequence`.
-  private static func numericalizeDataset(_ dataset: [String], alphabet: Alphabet)
+  private static func numericalizeDataset(_ dataset: [Substring], alphabet: Alphabet)
     -> [Phrase]
   {
     var phrases = [Phrase]()
@@ -101,7 +94,7 @@
         alphabet: alphabet, appendingEoSTo: trimmed)
       else { continue }
       let phrase = Phrase(
-        plainText: data,
+        plainText: String(data),
         numericalizedText: numericalizedText)
       phrases.append(phrase)
     }
@@ -114,28 +107,26 @@
   /// - Throws: an error in the Cocoa domain, if the default training file
   /// cannot be read.
public init() throws { - let downloadableArchive = DownloadableArchive() + let source = DownloadableArchive() let localStorageDirectory: URL = DatasetUtilities.defaultDirectory .appendingPathComponent("WordSeg", isDirectory: true) WordSegDataset.downloadIfNotPresent( - to: localStorageDirectory, downloadableArchive: downloadableArchive) + to: localStorageDirectory, source: source) - let archiveFileName = - downloadableArchive - .location.deletingPathExtension().lastPathComponent + let archiveFileName = source.location.deletingPathExtension().lastPathComponent let archiveDirectory = localStorageDirectory .appendingPathComponent(archiveFileName) let trainingFilePath = archiveDirectory - .appendingPathComponent(downloadableArchive.trainingFilePath).path + .appendingPathComponent(source.trainingFilePath).path let validationFilePath = archiveDirectory - .appendingPathComponent(downloadableArchive.validationFilePath).path + .appendingPathComponent(source.validationFilePath).path let testingFilePath = archiveDirectory - .appendingPathComponent(downloadableArchive.testingFilePath).path + .appendingPathComponent(source.testingFilePath).path try self.init( training: trainingFilePath, validation: validationFilePath, @@ -183,10 +174,10 @@ public struct WordSegDataset { self.testingPhrases = Self.numericalizeDataset(testing, alphabet: self.alphabet) } - /// Downloads and unpacks `downloadableArchive` to `directory` if it does not + /// Downloads and unpacks `source` to `directory` if it does not /// exist locally. 
private static func downloadIfNotPresent( - to directory: URL, downloadableArchive: DownloadableArchive + to directory: URL, source: DownloadableArchive ) { let downloadPath = directory.path let directoryExists = FileManager.default.fileExists(atPath: downloadPath) @@ -195,9 +186,9 @@ public struct WordSegDataset { guard !directoryExists || directoryEmpty else { return } - let remoteRoot = downloadableArchive.location.deletingLastPathComponent() - let filename = downloadableArchive.location.deletingPathExtension().lastPathComponent - let fileExtension = downloadableArchive.location.pathExtension + let remoteRoot = source.location.deletingLastPathComponent() + let filename = source.location.deletingPathExtension().lastPathComponent + let fileExtension = source.location.pathExtension // Downloads and extracts dataset files. let _ = DatasetUtilities.downloadResource( From e94247796f71c4af4d080d8a4b17d408ce7c63e3 Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Thu, 11 Jun 2020 10:34:05 -0700 Subject: [PATCH 27/30] Update Datasets/WordSeg/WordSegDataset.swift Co-authored-by: Dave Abrahams --- Datasets/WordSeg/WordSegDataset.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift index 1795f5262c1..20b9aa0bf39 100644 --- a/Datasets/WordSeg/WordSegDataset.swift +++ b/Datasets/WordSeg/WordSegDataset.swift @@ -32,7 +32,7 @@ public struct WordSegDataset { /// The validation data. public private(set) var validationPhrases: [Phrase] - /// The union of all characters in the included dataset. + /// A mapping between characters used in the dataset and densly-packed integers public let alphabet: Alphabet /// A pointer to source data. 
From 6e0ae1543f9bce9e5afad232567d6417832fd626 Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Thu, 11 Jun 2020 17:35:07 +0000 Subject: [PATCH 28/30] s/densly/densely/ --- Datasets/WordSeg/WordSegDataset.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift index 20b9aa0bf39..e4bcdb380bf 100644 --- a/Datasets/WordSeg/WordSegDataset.swift +++ b/Datasets/WordSeg/WordSegDataset.swift @@ -32,7 +32,7 @@ public struct WordSegDataset { /// The validation data. public private(set) var validationPhrases: [Phrase] - /// A mapping between characters used in the dataset and densly-packed integers + /// A mapping between characters used in the dataset and densely-packed integers public let alphabet: Alphabet /// A pointer to source data. From ac6436fa012229f40920422147728007974c3aa9 Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Thu, 11 Jun 2020 18:05:25 +0000 Subject: [PATCH 29/30] Remove hard-coded path --- Tests/DatasetsTests/WordSeg/WordSegDatasetTests.swift | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Tests/DatasetsTests/WordSeg/WordSegDatasetTests.swift b/Tests/DatasetsTests/WordSeg/WordSegDatasetTests.swift index 15cd2e6c75d..09ca58e6545 100644 --- a/Tests/DatasetsTests/WordSeg/WordSegDatasetTests.swift +++ b/Tests/DatasetsTests/WordSeg/WordSegDatasetTests.swift @@ -47,7 +47,10 @@ class WordSegDatasetTests: XCTestCase { func testCreateWordSegDatasetTrainingOnly() { do { - let dataset = try WordSegDataset(training: "/home/michellecasbon/tmp/seg/br/br-text/tr.txt") + let localStorageDirectory: URL = DatasetUtilities.defaultDirectory + .appendingPathComponent("WordSeg", isDirectory: true) + let trainingFile = localStorageDirectory.appendingPathComponent("/seg/br/br-text/tr.txt") + let dataset = try WordSegDataset(training: trainingFile.path) XCTAssertEqual(dataset.trainingPhrases.count, 7832) XCTAssertEqual(dataset.validationPhrases.count, 0) 
XCTAssertEqual(dataset.testingPhrases.count, 0) From 070d745d3f0b0453387e3a7b04baedefac5201f6 Mon Sep 17 00:00:00 2001 From: Michelle Casbon Date: Fri, 12 Jun 2020 22:27:57 +0000 Subject: [PATCH 30/30] Replace `WordSegDataset` with `Self` --- Datasets/WordSeg/WordSegDataset.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift index e4bcdb380bf..8bb166d7aa6 100644 --- a/Datasets/WordSeg/WordSegDataset.swift +++ b/Datasets/WordSeg/WordSegDataset.swift @@ -111,7 +111,7 @@ public struct WordSegDataset { let localStorageDirectory: URL = DatasetUtilities.defaultDirectory .appendingPathComponent("WordSeg", isDirectory: true) - WordSegDataset.downloadIfNotPresent( + Self.downloadIfNotPresent( to: localStorageDirectory, source: source) let archiveFileName = source.location.deletingPathExtension().lastPathComponent