Better Markov model

This commit is contained in:
Mark Probst
2018-02-08 07:41:25 -08:00
parent 46429a4596
commit 021e33b5e6
5 changed files with 52 additions and 5 deletions

File diff suppressed because one or more lines are too long

View File

@ -24,19 +24,30 @@ function shouldBeMap(properties: Map<string, ClassProperty>): Set<Type> | undefi
// Only classes with a certain number of properties are inferred
// as maps.
const numProperties = properties.size;
if (numProperties === 0) return undefined;
if (numProperties < 2) return undefined;
if (numProperties < mapSizeThreshold) {
const names = properties.keySeq();
const product = names.map(nameProbability).reduce((a, b) => a * b, 1);
const probabilities = names.map(nameProbability);
const product = probabilities.reduce((a, b) => a * b, 1);
const probability = Math.pow(product, 1 / numProperties);
// The idea behind this is to have a probability around 0.0004 for
// n=1, up to around 1.0 for n=20. I.e. when we only have a few
// properties, they need to look really weird to infer a map, but
// when we have more we'll accept more ordinary names. The details
// of the formula are immaterial because I pulled it out of my ass.
const limit = 0.000006 * Math.pow(numProperties + 2, 3.9);
const exponent = 5;
const scale = Math.pow(22, exponent);
const limit = Math.pow(numProperties + 2, exponent) / scale + (0.004 - Math.pow(3, exponent) / scale);
if (probability > limit) return undefined;
/*
console.log(
`limit for ${JSON.stringify(names.toArray())} - ${JSON.stringify(
probabilities.toArray()
)} is ${limit}, we are at ${probability}`
);
*/
}
// FIXME: simplify this - it's no longer necessary with the new

View File

@ -123,7 +123,9 @@ function testWord(mc: MarkovChain, word: string): void {
console.log(`"${word}": ${evaluate(mc, word)}`);
}
export function test(mc: MarkovChain): void {
export function test(): void {
const mc = load();
testWord(mc, "url");
testWord(mc, "json");
testWord(mc, "my_property");
@ -133,6 +135,16 @@ export function test(mc: MarkovChain): void {
testWord(mc, "2BTZIqw0ntH9MvilQ3ewNY");
testWord(mc, "0uBTNdNGb2OY5lou41iYL52LcDq2");
testWord(mc, "-KpqHmWuDOUnr1hmAhxp");
testWord(mc, "granularity");
testWord(mc, "coverage");
testWord(mc, "postingFrequency");
testWord(mc, "dataFrequency");
testWord(mc, "units");
testWord(mc, "datasetOwner");
testWord(mc, "organization");
testWord(mc, "timePeriod");
testWord(mc, "contactInformation");
testWord(
mc,
"\ud83d\udebe \ud83c\udd92 \ud83c\udd93 \ud83c\udd95 \ud83c\udd96 \ud83c\udd97 \ud83c\udd99 \ud83c\udfe7"

View File

@ -8,18 +8,22 @@ with open('/usr/share/dict/words') as f:
with open('acronyms.txt') as f:
acronyms = f.read().splitlines()
def all_lower(w):
    # Format a [word, is_acronym] pair entirely in lowercase.
    # The acronym flag is ignored for this style.
    return w[0].lower()
def all_upper(w):
    # Format a [word, is_acronym] pair entirely in uppercase.
    # The acronym flag is ignored for this style.
    return w[0].upper()
def capitalize(w):
    # Format a [word, is_acronym] pair with the first character
    # uppercased and the remainder lowercased. Safe for empty
    # strings because slicing never raises. The acronym flag is
    # ignored for this style.
    head = w[0][:1]
    tail = w[0][1:]
    return head.upper() + tail.lower()
def cap_and_upper_acro(w):
[word, is_acro] = w
if is_acro:
@ -27,6 +31,7 @@ def cap_and_upper_acro(w):
else:
return capitalize(w)
def choice(items):
total = sum([n for [n, _] in items])
x = random.random()
@ -39,8 +44,12 @@ def choice(items):
formats = [
[3, [all_lower, all_lower, "_"]],
[1, [all_upper, all_upper, "_"]],
[2, [all_lower, all_lower, "-"]],
[1, [all_upper, all_upper, "-"]],
[1, [all_lower, capitalize, "-"]],
[1, [all_lower, all_lower, " "]],
[1, [capitalize, capitalize, " "]],
[5, [all_lower, capitalize, ""]],
[5, [all_lower, cap_and_upper_acro, ""]],
[3, [cap_and_upper_acro, cap_and_upper_acro, ""]]
@ -51,21 +60,27 @@ prefixes = [
[1, "_"]
]
def word():
    # One entry: a random dictionary word, flagged as not-an-acronym.
    entry = [random.choice(words), False]
    return [entry]
def word_word():
    # Two entries: two independently drawn dictionary words,
    # neither flagged as an acronym.
    first = [random.choice(words), False]
    second = [random.choice(words), False]
    return [first, second]
def word_acronym():
    # Two entries: a dictionary word followed by an acronym
    # (flagged True so formatters can uppercase it).
    first = [random.choice(words), False]
    second = [random.choice(acronyms), True]
    return [first, second]
def acronym_word():
    # Two entries: an acronym (flagged True) followed by a
    # dictionary word.
    first = [random.choice(acronyms), True]
    second = [random.choice(words), False]
    return [first, second]
def word_digit():
    # Two entries: a dictionary word, then a number rendered as a
    # string. The nested randint draws the upper bound first, which
    # biases the result toward small values. RNG calls happen in
    # the same order as before: choice, inner randint, outer randint.
    first = [random.choice(words), False]
    bound = random.randint(1, 200)
    second = [str(random.randint(1, bound)), False]
    return [first, second]
def word_acronym_digit():
    # Three entries: word, acronym (flagged True), then a single
    # digit 1-9 rendered as a string.
    parts = []
    parts.append([random.choice(words), False])
    parts.append([random.choice(acronyms), True])
    parts.append([str(random.randint(1, 9)), False])
    return parts
@ -78,6 +93,7 @@ generators = [
[2, word_acronym_digit]
]
def make_corpus_entry():
words = choice(generators)()
[first_format, rest_format, separator] = choice(formats)

View File

@ -0,0 +1,8 @@
#!/bin/bash
# Regenerate the base64-encoded Markov chain and print it to stdout
# as a TypeScript string constant:
#   1. generate a synthetic corpus of property names,
#   2. build a Markov chain from it with quickertype,
#   3. gzip + base64 the chain into `export const encodedMarkovChain`.
#
# Fail fast: without -e/-o pipefail a failed generation step would
# still emit a bogus (empty or stale) constant to stdout.
set -euo pipefail

./generate-markov-corpus.py >/tmp/corpus.txt
../script/quickertype --build-markov-chain /tmp/corpus.txt >/tmp/markov.json
# -f overwrites a stale /tmp/markov.json.gz left by a previous run,
# which would otherwise make gzip prompt or fail.
gzip -9 -f /tmp/markov.json
echo -n 'export const encodedMarkovChain = "'
base64 /tmp/markov.json.gz | tr -d '\n'
echo '";'