* Make very hacky modifications to parser training script, to get Chinese up and running.

2025-10-22 11:44:16 +03:00 · 2016-04-28 14:30:24 +02:00 · 2016-04-28 14:30:24 +02:00 · 588026fe93
commit 588026fe93
parent b1cf2c16c3
1 changed files with 16 additions and 9 deletions
--- a/bin/parser/train.py
+++ b/bin/parser/train.py
@@ -41,17 +41,21 @@ def _corrupt(c, noise_level):
 def add_noise(orig, noise_level):
-    if random.random() >= noise_level:
+    # TODO
-        return orig
+    return orig.replace(' ', '')
-    elif type(orig) == list:
+    #if random.random() >= noise_level:
-        corrupted = [_corrupt(word, noise_level) for word in orig]
+    #    return orig
-        corrupted = [w for w in corrupted if w]
+    #elif type(orig) == list:
-        return corrupted
+    #    corrupted = [_corrupt(word, noise_level) for word in orig]
-    else:
+    #    corrupted = [w for w in corrupted if w]
-        return ''.join(_corrupt(c, noise_level) for c in orig)
+    #    return corrupted
+    #else:
+    #    return ''.join(_corrupt(c, noise_level) for c in orig)
 def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
+    # TODO
+    raw_text = raw_text.replace(' ', '')
    if raw_text is None:
        tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
    else:
@@ -139,7 +143,10 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples)
                if not gold.is_projective:
-                    raise Exception("Non-projective sentence in training: %s" % annot_tuples[1])
+                    # TODO
+                    nlp.tagger.train(tokens, gold.tags)
+                    continue
+                    #raise Exception("Non-projective sentence in training: %s" % annot_tuples[1])
                loss += nlp.parser.train(tokens, gold)
                nlp.entity.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)