-
Notifications
You must be signed in to change notification settings - Fork 149
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[WordSeg] Add inline documentation #566
Changes from 17 commits
3f7a6f0
2d42404
bdf71a9
d44b5bf
cf86c8c
8c3812e
d36a20c
8a8b513
951a98b
4de6e7d
060e88b
04c58bb
079c47f
711b3c8
31fc09b
270f2fd
e80dfc2
09e82b5
ec21c4e
d94a45e
4ca9ba9
6881b58
c9ffce6
80e575a
fb5e5d7
92b58e9
21fc998
cc3f30e
6ddabe7
361e380
d6546ea
e942477
6e0ae15
ac6436f
070d745
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,10 +14,17 @@ | |
|
||
import ModelSupport | ||
|
||
public struct WordSegRecord { | ||
/// A sequence of text for use in word segmentation. | ||
public struct Phrase { | ||
|
||
/// A raw, unprocessed sequence of text. | ||
public let plainText: String | ||
|
||
/// A sequence of text in numeric form, derived from `plainText`. | ||
public let numericalizedText: CharacterSequence | ||
|
||
/// Creates an instance containing both raw (`plainText`) and processed | ||
/// (`numericalizedText`) forms of a sequence of text. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've always thought “numericalized” read terribly awkwardizedly. I get the impression this is a term of art, but we should discuss whether it's the best choice. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Created #598 |
||
public init(plainText: String, numericalizedText: CharacterSequence) { | ||
self.plainText = plainText | ||
self.numericalizedText = numericalizedText | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,28 +15,51 @@ | |
import Foundation | ||
import ModelSupport | ||
|
||
/// A dataset targeted at the problem of word segmentation. | ||
/// | ||
/// The reference archive was published in the paper "Learning to Discover, | ||
/// Ground, and Use Words with Segmental Neural Language Models" by Kazuya | ||
/// Kawakami, Chris Dyer, and Phil Blunsom: | ||
/// https://www.aclweb.org/anthology/P19-1645.pdf. | ||
public struct WordSegDataset { | ||
texasmichelle marked this conversation as resolved.
Show resolved
Hide resolved
|
||
public let training: [WordSegRecord] | ||
public private(set) var testing: [WordSegRecord]? | ||
public private(set) var validation: [WordSegRecord]? | ||
|
||
/// The text used for training. | ||
texasmichelle marked this conversation as resolved.
Show resolved
Hide resolved
|
||
public let trainingPhrases: [Phrase] | ||
|
||
/// The text used for testing. | ||
public private(set) var testingPhrases: [Phrase]? | ||
|
||
/// The text used for validation. | ||
public private(set) var validationPhrases: [Phrase]? | ||
|
||
/// The union of all characters in the included dataset. | ||
texasmichelle marked this conversation as resolved.
Show resolved
Hide resolved
|
||
public let alphabet: Alphabet | ||
|
||
private struct DownloadDetails { | ||
var archiveLocation = URL(string: "https://s3.eu-west-2.amazonaws.com/k-kawakami")! | ||
var archiveFileName = "seg" | ||
var archiveExtension = "zip" | ||
/// A pointer to source data. | ||
texasmichelle marked this conversation as resolved.
Show resolved
Hide resolved
|
||
private struct ReferenceArchive { | ||
|
||
/// The location of the archive. | ||
texasmichelle marked this conversation as resolved.
Show resolved
Hide resolved
|
||
var location = URL(string: "https://s3.eu-west-2.amazonaws.com/k-kawakami/seg.zip")! | ||
|
||
/// The path to the test source. | ||
texasmichelle marked this conversation as resolved.
Show resolved
Hide resolved
|
||
var testingFilePath = "br/br-text/te.txt" | ||
|
||
/// The path to the training source. | ||
var trainingFilePath = "br/br-text/tr.txt" | ||
|
||
/// The path to the validation source. | ||
var validationFilePath = "br/br-text/va.txt" | ||
} | ||
|
||
private static func load(data: Data) throws -> [String] { | ||
/// Returns the text of all phrases parsed from `data` in UTF8. | ||
private static func load(data: Data) -> [String] { | ||
texasmichelle marked this conversation as resolved.
Show resolved
Hide resolved
|
||
guard let contents: String = String(data: data, encoding: .utf8) else { | ||
throw CharacterErrors.nonUtf8Data | ||
return [] | ||
} | ||
return load(contents: contents) | ||
} | ||
|
||
/// Returns the text of all phrases from `contents`. | ||
texasmichelle marked this conversation as resolved.
Show resolved
Hide resolved
|
||
private static func load(contents: String) -> [String] { | ||
texasmichelle marked this conversation as resolved.
Show resolved
Hide resolved
|
||
var strings = [String]() | ||
|
||
|
@@ -48,6 +71,11 @@ public struct WordSegDataset { | |
return strings | ||
} | ||
|
||
/// Returns the union of all characters in `training` and `otherSequences`. | ||
/// | ||
/// - Parameter eos: text to be used as the end of sequence marker. | ||
/// - Parameter eow: text to be used as the end of word marker. | ||
/// - Parameter pad: text to be used as the padding marker. | ||
texasmichelle marked this conversation as resolved.
Show resolved
Hide resolved
|
||
private static func makeAlphabet( | ||
datasets training: [String], | ||
_ otherSequences: [String]?..., | ||
|
@@ -73,52 +101,69 @@ public struct WordSegDataset { | |
return Alphabet(sorted, eos: eos, eow: eow, pad: pad) | ||
} | ||
|
||
private static func convertDataset(_ dataset: [String], alphabet: Alphabet) throws | ||
-> [WordSegRecord] | ||
/// Returns phrases from `dataset`, using `alphabet`, to be used with the | ||
/// WordSeg model. | ||
/// | ||
/// - Note: Omits any part of the dataset that cannot be converted to | ||
/// `CharacterSequence`. | ||
private static func convertDataset(_ dataset: [String], alphabet: Alphabet) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wow, you're right, there's loads to work on here; I have 1000 thoughts about this: is it just numericalizing the strings in dataset, and don't we just want to name the function after that? Should it not be an extension on There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Renamed to |
||
-> [Phrase] | ||
{ | ||
return try dataset.map { | ||
let trimmed = $0.components(separatedBy: .whitespaces).joined() | ||
return try WordSegRecord( | ||
plainText: $0, | ||
numericalizedText: CharacterSequence( | ||
alphabet: alphabet, appendingEoSTo: trimmed)) | ||
var phrases = [Phrase]() | ||
|
||
for data in dataset { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @dabrahams Same question here about how to compose this in a way that removes the raw loop? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How could I use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This feels like a symptom of awkward design. I believe rethinking There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. // untested of course
return dataset.compactMap { data in
let trimmed = data.split(separator: " ", omittingEmptySubsequences: true).joined()
let numericalizedText = try? CharacterSequence(alphabet: alphabet, appendingEoSTo: trimmed)
return numericalizedText.map { Phrase(plainText: String(data), numericalizedText: $0) }
} |
||
let trimmed = data.components(separatedBy: .whitespaces).joined() | ||
guard | ||
let numericalizedText = try? CharacterSequence( | ||
alphabet: alphabet, appendingEoSTo: trimmed) | ||
else { continue } | ||
let phrase = Phrase( | ||
plainText: data, | ||
numericalizedText: numericalizedText) | ||
phrases.append(phrase) | ||
} | ||
} | ||
private static func convertDataset(_ dataset: [String]?, alphabet: Alphabet) throws | ||
-> [WordSegRecord]? | ||
{ | ||
if let ds = dataset { | ||
let tmp: [WordSegRecord] = try convertDataset(ds, alphabet: alphabet) // Use tmp to disambiguate function | ||
return tmp | ||
} | ||
return nil | ||
|
||
return phrases | ||
} | ||
|
||
/// Creates an instance containing phrases from the default location. | ||
/// | ||
/// - Throws: an error in the Cocoa domain, if the default training file | ||
/// cannot be read. | ||
public init() throws { | ||
let downloadDetails = DownloadDetails() | ||
let referenceArchive = ReferenceArchive() | ||
let localStorageDirectory: URL = DatasetUtilities.defaultDirectory | ||
.appendingPathComponent("WordSeg", isDirectory: true) | ||
|
||
WordSegDataset.downloadIfNotPresent(to: localStorageDirectory, downloadDetails: downloadDetails) | ||
WordSegDataset.downloadIfNotPresent( | ||
texasmichelle marked this conversation as resolved.
Show resolved
Hide resolved
|
||
to: localStorageDirectory, referenceArchive: referenceArchive) | ||
|
||
let archiveFileName = | ||
referenceArchive | ||
.location.deletingPathExtension().lastPathComponent | ||
let archiveDirectory = | ||
localStorageDirectory | ||
.appendingPathComponent(downloadDetails.archiveFileName) | ||
.appendingPathComponent(archiveFileName) | ||
let trainingFilePath = | ||
archiveDirectory | ||
.appendingPathComponent(downloadDetails.trainingFilePath).path | ||
.appendingPathComponent(referenceArchive.trainingFilePath).path | ||
let validationFilePath = | ||
archiveDirectory | ||
.appendingPathComponent(downloadDetails.validationFilePath).path | ||
.appendingPathComponent(referenceArchive.validationFilePath).path | ||
let testingFilePath = | ||
archiveDirectory | ||
.appendingPathComponent(downloadDetails.testingFilePath).path | ||
.appendingPathComponent(referenceArchive.testingFilePath).path | ||
|
||
try self.init( | ||
training: trainingFilePath, validation: validationFilePath, | ||
testing: testingFilePath) | ||
} | ||
|
||
/// Creates an instance containing phrases from `trainingFile`, and | ||
/// optionally `validationFile` and `testingFile`. | ||
/// | ||
/// - Throws: an error in the Cocoa domain, if `trainingFile` cannot be | ||
/// read. | ||
public init( | ||
training trainingFile: String, | ||
validation validationFile: String? = nil, | ||
|
@@ -127,53 +172,64 @@ public struct WordSegDataset { | |
let trainingData = try Data( | ||
contentsOf: URL(fileURLWithPath: trainingFile), | ||
options: .alwaysMapped) | ||
let training = try Self.load(data: trainingData) | ||
let training = Self.load(data: trainingData) | ||
|
||
var validation: [String]? = nil | ||
var testing: [String]? = nil | ||
let validation: [String] | ||
let testing: [String] | ||
|
||
if let validationFile = validationFile { | ||
let data = try Data( | ||
contentsOf: URL(fileURLWithPath: validationFile), | ||
options: .alwaysMapped) | ||
validation = try Self.load(data: data) | ||
validation = Self.load(data: data) | ||
} else { | ||
validation = [String]() | ||
} | ||
|
||
if let testingFile = testingFile { | ||
let data: Data = try Data( | ||
contentsOf: URL(fileURLWithPath: testingFile), | ||
options: .alwaysMapped) | ||
testing = try Self.load(data: data) | ||
testing = Self.load(data: data) | ||
} else { | ||
testing = [String]() | ||
} | ||
|
||
self.alphabet = Self.makeAlphabet(datasets: training, validation, testing) | ||
self.training = try Self.convertDataset(training, alphabet: self.alphabet) | ||
self.validation = try Self.convertDataset(validation, alphabet: self.alphabet) | ||
self.testing = try Self.convertDataset(testing, alphabet: self.alphabet) | ||
self.trainingPhrases = Self.convertDataset(training, alphabet: self.alphabet) | ||
self.validationPhrases = Self.convertDataset(validation, alphabet: self.alphabet) | ||
self.testingPhrases = Self.convertDataset(testing, alphabet: self.alphabet) | ||
} | ||
|
||
/// Creates an instance containing phrases from `trainingData`, and | ||
/// optionally `validationData` and `testingData`. | ||
public init( | ||
training trainingData: Data, validation validationData: Data?, testing testingData: Data? | ||
) | ||
throws | ||
{ | ||
let training = try Self.load(data: trainingData) | ||
var validation: [String]? = nil | ||
var testing: [String]? = nil | ||
) { | ||
let training = Self.load(data: trainingData) | ||
let validation: [String] | ||
let testing: [String] | ||
if let validationData = validationData { | ||
validation = try Self.load(data: validationData) | ||
validation = Self.load(data: validationData) | ||
} else { | ||
validation = [String]() | ||
} | ||
if let testingData = testingData { | ||
testing = try Self.load(data: testingData) | ||
testing = Self.load(data: testingData) | ||
} else { | ||
testing = [String]() | ||
} | ||
|
||
self.alphabet = Self.makeAlphabet(datasets: training, validation, testing) | ||
self.training = try Self.convertDataset(training, alphabet: self.alphabet) | ||
self.validation = try Self.convertDataset(validation, alphabet: self.alphabet) | ||
self.testing = try Self.convertDataset(testing, alphabet: self.alphabet) | ||
self.trainingPhrases = Self.convertDataset(training, alphabet: self.alphabet) | ||
self.validationPhrases = Self.convertDataset(validation, alphabet: self.alphabet) | ||
self.testingPhrases = Self.convertDataset(testing, alphabet: self.alphabet) | ||
} | ||
|
||
/// Downloads and unpacks `referenceArchive` to `directory` if it does not | ||
/// exist locally. | ||
private static func downloadIfNotPresent( | ||
to directory: URL, downloadDetails: DownloadDetails | ||
to directory: URL, referenceArchive: ReferenceArchive | ||
) { | ||
let downloadPath = directory.path | ||
let directoryExists = FileManager.default.fileExists(atPath: downloadPath) | ||
|
@@ -182,11 +238,15 @@ public struct WordSegDataset { | |
|
||
guard !directoryExists || directoryEmpty else { return } | ||
|
||
let remoteRoot = referenceArchive.location.deletingLastPathComponent() | ||
let filename = referenceArchive.location.deletingPathExtension().lastPathComponent | ||
let fileExtension = referenceArchive.location.pathExtension | ||
|
||
// Downloads and extracts dataset files. | ||
let _ = DatasetUtilities.downloadResource( | ||
filename: downloadDetails.archiveFileName, | ||
fileExtension: downloadDetails.archiveExtension, | ||
remoteRoot: downloadDetails.archiveLocation, | ||
filename: filename, | ||
fileExtension: fileExtension, | ||
remoteRoot: remoteRoot, | ||
localStorageDirectory: directory, extract: true) | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I remember wanting to mention this before, but if “Character” doesn't mean what in Swift is called `Character`, we should look for other names, e.g. “glyph.”
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As a start, I created #600 for using `Character` instead of `String`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It doesn't exactly address this, but heads in the direction of cleaning up that design overall.