Skip to content

Commit

Permalink
Optimizations (~70x faster) - was only aiming for memory optimization in the earlier release
Browse files Browse the repository at this point in the history
  • Loading branch information
Lundez committed Jan 5, 2020
1 parent 284f0fc commit c66dccc
Show file tree
Hide file tree
Showing 6 changed files with 30 additions and 24 deletions.
2 changes: 1 addition & 1 deletion build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ plugins {


group = "com.londogard"
version = "1.0-beta"
version = "1.0.1-beta"

repositories {
mavenCentral()
Expand Down
13 changes: 7 additions & 6 deletions src/main/kotlin/com/londogard/textgen/backends/BackendLM.kt
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,13 @@ import java.io.File

@ImplicitReflectionSerializer
abstract class BackendLM<T> {
protected abstract val mapSerializer: KSerializer<Map<List<T>, Double>>
private val mapCharSerializer = (Char::class.serializer().list to Double::class.serializer()).map
protected abstract val mapSerializer: KSerializer<Map<String, Map<T, Double>>>
protected val stringSerializer = String::class.serializer()
protected val doubleSerializer = Double::class.serializer()
private val cborSerializer = Cbor.plain
private val padStart: Char = '\u0002'
private val padEnd: Char = '\u0003'
protected abstract var internalLanguageModel: Map<List<T>, Double>
protected abstract var internalLanguageModel: Map<String, Map<T, Double>> // Map<String, Map<T, Double>>
protected abstract val n: Int
// n copies of the end-padding marker (padEnd), each as a single-character string.
// NOTE(review): `n` is abstract and is read here while the base class is still being
// initialized; if the subclass has not assigned `n` yet, this list may be built with
// n == 0 (i.e. empty) — confirm against the subclasses.
val padEndList = List(n) { padEnd.toString() }
// n copies of the start-padding marker (padStart), each as a single-character string.
// NOTE(review): also reads abstract `n` during base-class initialization — TODO confirm.
val padStartList = List(n) { padStart.toString() }
Expand All @@ -26,15 +27,15 @@ abstract class BackendLM<T> {

/**
 * Opens the resource at [path] as a stream, resolved through this object's class.
 *
 * `getResourceAsStream` returns `null` when the resource does not exist; that case is
 * turned into an explicit failure that names the missing path instead of an opaque
 * null-check error.
 *
 * @param path resource path handed to `Class.getResourceAsStream`.
 * @return an open stream; the caller is responsible for closing it.
 * @throws IllegalStateException if no resource can be found at [path].
 */
private fun getResource(path: String): InputStream =
    checkNotNull(this::class.java.getResourceAsStream(path)) { "Resource not found: $path" }

protected fun serializeMapToFile(name: String, map: Map<List<T>, Double>): Unit = cborSerializer
/**
 * Encodes [map] with [mapSerializer] through the CBOR serializer and writes the
 * resulting bytes to the file at [name].
 *
 * @param name path of the file to write.
 * @param map the model map to persist.
 */
protected fun serializeMapToFile(name: String, map: Map<String, Map<T, Double>>) {
    val encoded = cborSerializer.dump(mapSerializer, map)
    File(name).writeBytes(encoded)
}

protected fun readSerializedMapFromFile(name: String): Map<List<T>, Double> = File(name)
/**
 * Reads all bytes of the file at [name] and decodes them with [mapSerializer]
 * through the CBOR serializer.
 *
 * @param name path of the file to read.
 * @return the decoded model map.
 */
protected fun readSerializedMapFromFile(name: String): Map<String, Map<T, Double>> {
    val rawBytes = File(name).readBytes()
    return cborSerializer.load(mapSerializer, rawBytes)
}

protected fun readSerializedMapFromResource(name: String): Map<List<T>, Double> = getResource(name)
/**
 * Loads the resource [name] via [getResource], reads it fully and decodes the bytes
 * with [mapSerializer] through the CBOR serializer.
 *
 * @param name resource path passed on to [getResource].
 * @return the decoded model map.
 */
protected fun readSerializedMapFromResource(name: String): Map<String, Map<T, Double>> = getResource(name)
    // readBytes() does not close the stream, so close it explicitly once it has been drained.
    .use { stream -> stream.readBytes() }
    .let { cborSerializer.load(mapSerializer, it) }
}
39 changes: 22 additions & 17 deletions src/main/kotlin/com/londogard/textgen/backends/NGramWordLM.kt
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,16 @@ import com.londogard.textgen.NGram
import com.londogard.textgen.ngramNormalize
import kotlinx.serialization.*
import java.io.File
import kotlin.math.min
import kotlin.math.pow
import kotlin.random.Random

@ImplicitReflectionSerializer
class NGramWordLM(
override val n: Int,
override var internalLanguageModel: Map<List<String>, Double> = emptyMap(),
override val mapSerializer: KSerializer<Map<List<String>, Double>> = (String::class.serializer().list to Double::class.serializer()).map
) :
BackendLM<String>() {
override var internalLanguageModel: Map<String, Map<String, Double>> = emptyMap()) : BackendLM<String>() {
private val stringDouble = (stringSerializer to doubleSerializer).map
override val mapSerializer: KSerializer<Map<String, Map<String, Double>>> = (stringSerializer to stringDouble).map

/**
 * String-input variant of next-token prediction. Not implemented yet: the body is a
 * `TODO(...)` call, so invoking it always throws [NotImplementedError].
 */
override fun predictNext(input: String, temperature: Double): String {
    TODO("Implement this, don't forget to not remove \n etc")
}
Expand Down Expand Up @@ -52,11 +52,20 @@ class NGramWordLM(

val totalCount = internalModel.filterKeys { it.size == 1 }.values.sum()

internalLanguageModel = internalModel
val precomputedModel = internalModel
.mapValues { (key, value) ->
(value / (internalModel[key.dropLast(1)] ?: totalCount))
}

internalLanguageModel = precomputedModel
.entries
.groupBy( { it.key.dropLast(1).joinToString(" ") } , { it.key.last() to it.value })
.mapValues { it.value.toMap() }
//internalLanguageModel = internalModel
// .mapValues { (key, value) ->
// (0.4.pow(n - key.size)) * value / (internalModel[key.dropLast(1)] ?: totalCount)
// }

// TODO add Kneser-Ney Smooth
//val discountByN = (1..n).map { i ->
// val ndValues = modelByN[i]?.filterValues { it in listOf(1.0, 2.0) } ?: emptyMap()
Expand All @@ -71,17 +80,13 @@ class NGramWordLM(

override fun predictNext(input: List<String>, temperature: Double): String {
val history = input.takeLast(n - 1)
val options = (n downTo 1)
.asSequence()
.map { i ->
val discount = 0.4.pow(n.toDouble() - i)

internalLanguageModel
.filterKeys { it.size == i && (it.size == 1 || it.take(i - 1) == history.takeLast(i - 1)) }
.mapValues { it.value * discount }.entries
}
.map { it.sortedByDescending { l -> l.value } }
.flatten()
val keys = (min(input.size, n) downTo 0).map {
input.takeLast(it).joinToString(" ")
}

val options = keys.asSequence()
.mapNotNull { key -> internalLanguageModel[key]?.entries }
.flatMap { it.sortedByDescending { subEntry -> subEntry.value }.take(10).asSequence() }
.take(10)
.toList()

Expand All @@ -93,6 +98,6 @@ class NGramWordLM(
selection -= it.value
selection > 0
}
.first().key.last()
.first().key
}
}
Binary file modified src/main/resources/models/cardsagainst_black.cbor
Binary file not shown.
Binary file modified src/main/resources/models/cardsagainst_white.cbor
Binary file not shown.
Binary file modified src/main/resources/models/shakespeare.cbor
Binary file not shown.

0 comments on commit c66dccc

Please sign in to comment.