Better Markov model
This commit is contained in:
File diff suppressed because one or more lines are too long
@ -24,19 +24,30 @@ function shouldBeMap(properties: Map<string, ClassProperty>): Set<Type> | undefi
|
||||
// Only classes with a certain number of properties are inferred
|
||||
// as maps.
|
||||
const numProperties = properties.size;
|
||||
if (numProperties === 0) return undefined;
|
||||
if (numProperties < 2) return undefined;
|
||||
|
||||
if (numProperties < mapSizeThreshold) {
|
||||
const names = properties.keySeq();
|
||||
const product = names.map(nameProbability).reduce((a, b) => a * b, 1);
|
||||
const probabilities = names.map(nameProbability);
|
||||
const product = probabilities.reduce((a, b) => a * b, 1);
|
||||
const probability = Math.pow(product, 1 / numProperties);
|
||||
// The idea behind this is to have a probability around 0.0004 for
|
||||
// n=1, up to around 1.0 for n=20. I.e. when we only have a few
|
||||
// properties, they need to look really weird to infer a map, but
|
||||
// when we have more we'll accept more ordinary names. The details
|
||||
// of the formula are immaterial because I pulled it out of my ass.
|
||||
const limit = 0.000006 * Math.pow(numProperties + 2, 3.9);
|
||||
const exponent = 5;
|
||||
const scale = Math.pow(22, exponent);
|
||||
const limit = Math.pow(numProperties + 2, exponent) / scale + (0.004 - Math.pow(3, exponent) / scale);
|
||||
if (probability > limit) return undefined;
|
||||
|
||||
/*
|
||||
console.log(
|
||||
`limit for ${JSON.stringify(names.toArray())} - ${JSON.stringify(
|
||||
probabilities.toArray()
|
||||
)} is ${limit}, we are at ${probability}`
|
||||
);
|
||||
*/
|
||||
}
|
||||
|
||||
// FIXME: simplify this - it's no longer necessary with the new
|
||||
|
@ -123,7 +123,9 @@ function testWord(mc: MarkovChain, word: string): void {
|
||||
console.log(`"${word}": ${evaluate(mc, word)}`);
|
||||
}
|
||||
|
||||
export function test(mc: MarkovChain): void {
|
||||
export function test(): void {
|
||||
const mc = load();
|
||||
|
||||
testWord(mc, "url");
|
||||
testWord(mc, "json");
|
||||
testWord(mc, "my_property");
|
||||
@ -133,6 +135,16 @@ export function test(mc: MarkovChain): void {
|
||||
testWord(mc, "2BTZIqw0ntH9MvilQ3ewNY");
|
||||
testWord(mc, "0uBTNdNGb2OY5lou41iYL52LcDq2");
|
||||
testWord(mc, "-KpqHmWuDOUnr1hmAhxp");
|
||||
testWord(mc, "granularity");
|
||||
testWord(mc, "coverage");
|
||||
testWord(mc, "postingFrequency");
|
||||
testWord(mc, "dataFrequency");
|
||||
testWord(mc, "units");
|
||||
testWord(mc, "datasetOwner");
|
||||
testWord(mc, "organization");
|
||||
testWord(mc, "timePeriod");
|
||||
testWord(mc, "contactInformation");
|
||||
|
||||
testWord(
|
||||
mc,
|
||||
"\ud83d\udebe \ud83c\udd92 \ud83c\udd93 \ud83c\udd95 \ud83c\udd96 \ud83c\udd97 \ud83c\udd99 \ud83c\udfe7"
|
||||
|
@ -8,18 +8,22 @@ with open('/usr/share/dict/words') as f:
|
||||
with open('acronyms.txt') as f:
|
||||
acronyms = f.read().splitlines()
|
||||
|
||||
|
||||
def all_lower(w):
    """Format a [word, is_acronym] pair as all-lowercase text.

    The acronym flag is ignored for this format.
    """
    text = w[0]
    return text.lower()
|
||||
|
||||
|
||||
def all_upper(w):
    """Format a [word, is_acronym] pair as all-uppercase text.

    The acronym flag is ignored for this format.
    """
    text = w[0]
    return text.upper()
|
||||
|
||||
|
||||
def capitalize(w):
    """Format a [word, is_acronym] pair with the first letter uppercased
    and the remainder lowercased; the acronym flag is ignored.
    """
    text = w[0]
    head, tail = text[:1], text[1:]
    return head.upper() + tail.lower()
|
||||
|
||||
|
||||
def cap_and_upper_acro(w):
|
||||
[word, is_acro] = w
|
||||
if is_acro:
|
||||
@ -27,6 +31,7 @@ def cap_and_upper_acro(w):
|
||||
else:
|
||||
return capitalize(w)
|
||||
|
||||
|
||||
def choice(items):
|
||||
total = sum([n for [n, _] in items])
|
||||
x = random.random()
|
||||
@ -39,8 +44,12 @@ def choice(items):
|
||||
|
||||
formats = [
|
||||
[3, [all_lower, all_lower, "_"]],
|
||||
[1, [all_upper, all_upper, "_"]],
|
||||
[2, [all_lower, all_lower, "-"]],
|
||||
[1, [all_upper, all_upper, "-"]],
|
||||
[1, [all_lower, capitalize, "-"]],
|
||||
[1, [all_lower, all_lower, " "]],
|
||||
[1, [capitalize, capitalize, " "]],
|
||||
[5, [all_lower, capitalize, ""]],
|
||||
[5, [all_lower, cap_and_upper_acro, ""]],
|
||||
[3, [cap_and_upper_acro, cap_and_upper_acro, ""]]
|
||||
@ -51,21 +60,27 @@ prefixes = [
|
||||
[1, "_"]
|
||||
]
|
||||
|
||||
|
||||
def word():
    """Pick a single dictionary word; returns one [word, is_acronym] pair."""
    picked = random.choice(words)
    return [[picked, False]]
|
||||
|
||||
|
||||
def word_word():
    """Pick two dictionary words; returns two [word, is_acronym] pairs."""
    first = random.choice(words)
    second = random.choice(words)
    return [[first, False], [second, False]]
|
||||
|
||||
|
||||
def word_acronym():
    """Pick a dictionary word followed by an acronym (flagged True)."""
    return [
        [random.choice(words), False],
        [random.choice(acronyms), True],
    ]
|
||||
|
||||
|
||||
def acronym_word():
    """Pick an acronym (flagged True) followed by a dictionary word."""
    return [
        [random.choice(acronyms), True],
        [random.choice(words), False],
    ]
|
||||
|
||||
|
||||
def word_digit():
    """Pick a dictionary word followed by a random number rendered as text.

    The upper bound is itself random (1..200), so small numbers are much
    more likely than large ones — this nesting is deliberate.
    """
    upper = random.randint(1, 200)
    number = random.randint(1, upper)
    return [[random.choice(words), False], [str(number), False]]
|
||||
|
||||
|
||||
def word_acronym_digit():
    """Pick word + acronym + a single digit (1..9) rendered as text."""
    digit = str(random.randint(1, 9))
    return [
        [random.choice(words), False],
        [random.choice(acronyms), True],
        [digit, False],
    ]
|
||||
|
||||
@ -78,6 +93,7 @@ generators = [
|
||||
[2, word_acronym_digit]
|
||||
]
|
||||
|
||||
|
||||
def make_corpus_entry():
|
||||
words = choice(generators)()
|
||||
[first_format, rest_format, separator] = choice(formats)
|
||||
|
8
test/make-encoded-markov-chain.sh
Executable file
8
test/make-encoded-markov-chain.sh
Executable file
@ -0,0 +1,8 @@
|
||||
#!/bin/bash

# Regenerate the embedded Markov chain:
# 1. Produce a synthetic corpus of property names.
./generate-markov-corpus.py >/tmp/corpus.txt
# 2. Train the Markov chain from the corpus (emits JSON).
../script/quickertype --build-markov-chain /tmp/corpus.txt >/tmp/markov.json
# 3. Compress at maximum level; gzip replaces the file with markov.json.gz.
gzip -9 /tmp/markov.json
# 4. Emit a TypeScript module with the chain as a single base64 string
#    (newlines stripped so it stays on one line). Output goes to stdout;
#    redirect it into the target .ts file.
echo -n 'export const encodedMarkovChain = "'
base64 /tmp/markov.json.gz | tr -d '\n'
echo '";'
|
Reference in New Issue
Block a user