* Update input corruption method to work with lists as well as strings

2025-08-10 15:14:56 +03:00 · 2015-06-05 19:33:32 +02:00 · 2015-06-05 19:33:32 +02:00 · 362f87dc3a
commit 362f87dc3a
parent 33e70b167f
1 changed files with 15 additions and 2 deletions
--- a/bin/parser/train.py
+++ b/bin/parser/train.py
@ -24,7 +24,7 @@ from spacy.gold import GoldParse
 from spacy.scorer import Scorer


-def add_noise(c, noise_level):
+def _corrupt(c, noise_level):
    if random.random() >= noise_level:
        return c
    elif c == ' ':
@ -37,6 +37,17 @@ def add_noise(c, noise_level):
        return c.lower()


+def add_noise(orig, noise_level):
+    if random.random() >= noise_level:
+        return orig
+    elif type(orig) == list:
+        corrupted = [_corrupt(word, noise_level) for word in orig]
+        corrupted = [w for w in corrupted if w]
+        return corrupted
+    else:
+        return ''.join(_corrupt(c, noise_level) for c in orig)
+
+
 def score_model(scorer, nlp, raw_text, annot_tuples):
    if raw_text is None:
        tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
@ -109,8 +120,10 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
                    continue
                score_model(scorer, nlp, raw_text, annot_tuples)
                if raw_text is None:
-                    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
+                    words = add_noise(annot_tuples[1], corruption_level)
+                    tokens = nlp.tokenizer.tokens_from_list(words)
                else:
+                    raw_text = add_noise(raw_text, corruption_level)
                    tokens = nlp.tokenizer(raw_text)
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples, make_projective=True)