Better Markov model

This commit is contained in:
Mark Probst
2018-02-08 07:41:25 -08:00
parent 46429a4596
commit 021e33b5e6
5 changed files with 52 additions and 5 deletions

File diff suppressed because one or more lines are too long

View File

@ -24,19 +24,30 @@ function shouldBeMap(properties: Map<string, ClassProperty>): Set<Type> | undefi
// Only classes with a certain number of properties are inferred
// as maps.
const numProperties = properties.size;
if (numProperties === 0) return undefined;
if (numProperties < 2) return undefined;
if (numProperties < mapSizeThreshold) {
const names = properties.keySeq();
const product = names.map(nameProbability).reduce((a, b) => a * b, 1);
const probabilities = names.map(nameProbability);
const product = probabilities.reduce((a, b) => a * b, 1);
const probability = Math.pow(product, 1 / numProperties);
// The idea behind this is to have a probability around 0.0004 for
// n=1, up to around 1.0 for n=20. I.e. when we only have a few
// properties, they need to look really weird to infer a map, but
// when we have more we'll accept more ordinary names. The details
// of the formula are immaterial because I pulled it out of my ass.
const limit = 0.000006 * Math.pow(numProperties + 2, 3.9);
const exponent = 5;
const scale = Math.pow(22, exponent);
const limit = Math.pow(numProperties + 2, exponent) / scale + (0.004 - Math.pow(3, exponent) / scale);
if (probability > limit) return undefined;
/*
console.log(
`limit for ${JSON.stringify(names.toArray())} - ${JSON.stringify(
probabilities.toArray()
)} is ${limit}, we are at ${probability}`
);
*/
}
// FIXME: simplify this - it's no longer necessary with the new

View File

@ -123,7 +123,9 @@ function testWord(mc: MarkovChain, word: string): void {
console.log(`"${word}": ${evaluate(mc, word)}`);
}
export function test(mc: MarkovChain): void {
export function test(): void {
const mc = load();
testWord(mc, "url");
testWord(mc, "json");
testWord(mc, "my_property");
@ -133,6 +135,16 @@ export function test(mc: MarkovChain): void {
testWord(mc, "2BTZIqw0ntH9MvilQ3ewNY");
testWord(mc, "0uBTNdNGb2OY5lou41iYL52LcDq2");
testWord(mc, "-KpqHmWuDOUnr1hmAhxp");
testWord(mc, "granularity");
testWord(mc, "coverage");
testWord(mc, "postingFrequency");
testWord(mc, "dataFrequency");
testWord(mc, "units");
testWord(mc, "datasetOwner");
testWord(mc, "organization");
testWord(mc, "timePeriod");
testWord(mc, "contactInformation");
testWord(
mc,
"\ud83d\udebe \ud83c\udd92 \ud83c\udd93 \ud83c\udd95 \ud83c\udd96 \ud83c\udd97 \ud83c\udd99 \ud83c\udfe7"

View File

@ -8,18 +8,22 @@ with open('/usr/share/dict/words') as f:
with open('acronyms.txt') as f:
acronyms = f.read().splitlines()
def all_lower(w):
    # Format a [word, is_acronym] pair entirely in lowercase.
    # The acronym flag is ignored for this style.
    return w[0].lower()
def all_upper(w):
    # Format a [word, is_acronym] pair entirely in uppercase.
    # The acronym flag is ignored for this style.
    return w[0].upper()
def capitalize(w):
    # Format a [word, is_acronym] pair with the first character
    # uppercased and the remainder lowercased. Safe for empty
    # strings because slicing never raises. The acronym flag is
    # ignored for this style.
    head = w[0][:1]
    tail = w[0][1:]
    return head.upper() + tail.lower()
def cap_and_upper_acro(w):
[word, is_acro] = w
if is_acro:
@ -27,6 +31,7 @@ def cap_and_upper_acro(w):
else:
return capitalize(w)
def choice(items):
total = sum([n for [n, _] in items])
x = random.random()
@ -39,8 +44,12 @@ def choice(items):
formats = [
[3, [all_lower, all_lower, "_"]],
[1, [all_upper, all_upper, "_"]],
[2, [all_lower, all_lower, "-"]],
[1, [all_upper, all_upper, "-"]],
[1, [all_lower, capitalize, "-"]],
[1, [all_lower, all_lower, " "]],
[1, [capitalize, capitalize, " "]],
[5, [all_lower, capitalize, ""]],
[5, [all_lower, cap_and_upper_acro, ""]],
[3, [cap_and_upper_acro, cap_and_upper_acro, ""]]
@ -51,21 +60,27 @@ prefixes = [
[1, "_"]
]
def word():
    # One entry: a random dictionary word, flagged as not-an-acronym.
    entry = [random.choice(words), False]
    return [entry]
def word_word():
    # Two entries: two independently drawn dictionary words,
    # neither flagged as an acronym.
    first = [random.choice(words), False]
    second = [random.choice(words), False]
    return [first, second]
def word_acronym():
    # Two entries: a dictionary word followed by an acronym
    # (flagged True so formatters can uppercase it).
    first = [random.choice(words), False]
    second = [random.choice(acronyms), True]
    return [first, second]
def acronym_word():
    # Two entries: an acronym (flagged True) followed by a
    # dictionary word.
    first = [random.choice(acronyms), True]
    second = [random.choice(words), False]
    return [first, second]
def word_digit():
    # Two entries: a dictionary word, then a number rendered as a
    # string. The nested randint draws the upper bound first, which
    # biases the result toward small values. RNG calls happen in
    # the same order as before: choice, inner randint, outer randint.
    first = [random.choice(words), False]
    bound = random.randint(1, 200)
    second = [str(random.randint(1, bound)), False]
    return [first, second]
def word_acronym_digit():
    # Three entries: word, acronym (flagged True), then a single
    # digit 1-9 rendered as a string.
    parts = []
    parts.append([random.choice(words), False])
    parts.append([random.choice(acronyms), True])
    parts.append([str(random.randint(1, 9)), False])
    return parts
@ -78,6 +93,7 @@ generators = [
[2, word_acronym_digit]
]
def make_corpus_entry():
words = choice(generators)()
[first_format, rest_format, separator] = choice(formats)

View File

@ -0,0 +1,8 @@
#!/bin/bash
# Regenerate the base64-encoded Markov chain and print it to stdout
# as a TypeScript string constant:
#   1. generate a synthetic corpus of property names,
#   2. build a Markov chain from it with quickertype,
#   3. gzip + base64 the chain into `export const encodedMarkovChain`.
#
# Fail fast: without -e/-o pipefail a failed generation step would
# still emit a bogus (empty or stale) constant to stdout.
set -euo pipefail

./generate-markov-corpus.py >/tmp/corpus.txt
../script/quickertype --build-markov-chain /tmp/corpus.txt >/tmp/markov.json
# -f overwrites a stale /tmp/markov.json.gz left by a previous run,
# which would otherwise make gzip prompt or fail.
gzip -9 -f /tmp/markov.json
echo -n 'export const encodedMarkovChain = "'
base64 /tmp/markov.json.gz | tr -d '\n'
echo '";'