* Update input corruption method to work with lists as well as strings

2025-11-03 01:17:52 +03:00 · 2015-06-05 19:33:32 +02:00 · 2015-06-05 19:33:32 +02:00 · 362f87dc3a
commit 362f87dc3a
parent 33e70b167f
1 changed files with 15 additions and 2 deletions
--- a/bin/parser/train.py
+++ b/bin/parser/train.py
@ -24,7 +24,7 @@ from spacy.gold import GoldParse
 from spacy.scorer import Scorer
-def add_noise(c, noise_level):
+def _corrupt(c, noise_level):
    if random.random() >= noise_level:
        return c
    elif c == ' ':
@ -37,6 +37,17 @@ def add_noise(c, noise_level):
        return c.lower()
 def add_noise(orig, noise_level):
    """Randomly corrupt ``orig``, which may be a string or a list of words.

    A single draw of ``random.random()`` decides whether the input is touched
    at all: when the draw is ``>= noise_level`` the input is returned as-is
    (the very same object, not a copy). Otherwise every element is passed
    through ``_corrupt`` with the same ``noise_level``.

    Args:
        orig: Either a list of words or a string of characters.
        noise_level: Threshold compared against ``random.random()``; also
            forwarded unchanged to ``_corrupt`` for each element.

    Returns:
        For list input, a new list of the corrupted words with falsy results
        removed (so it may be shorter than ``orig``). For any other input,
        a string built by joining the corrupted characters.
    """
    if random.random() >= noise_level:
        return orig
    # isinstance (rather than an exact type comparison) so that list
    # subclasses are also handled word-by-word instead of being joined
    # into a single string.
    if isinstance(orig, list):
        corrupted = (_corrupt(word, noise_level) for word in orig)
        # Words that _corrupt turned into a falsy value are dropped.
        return [word for word in corrupted if word]
    return ''.join(_corrupt(c, noise_level) for c in orig)
 def score_model(scorer, nlp, raw_text, annot_tuples):
    if raw_text is None:
        tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
@ -109,8 +120,10 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
                    continue
                score_model(scorer, nlp, raw_text, annot_tuples)
                if raw_text is None:
-                    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
+                    words = add_noise(annot_tuples[1], corruption_level)
                    tokens = nlp.tokenizer.tokens_from_list(words)
                else:
                    raw_text = add_noise(raw_text, corruption_level)
                    tokens = nlp.tokenizer(raw_text)
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples, make_projective=True)