* Update input corruption method to work with lists as well as strings

This commit is contained in:
Matthew Honnibal 2015-06-05 19:33:32 +02:00
parent 33e70b167f
commit 362f87dc3a

View File

@ -24,7 +24,7 @@ from spacy.gold import GoldParse
from spacy.scorer import Scorer
def add_noise(c, noise_level):
def _corrupt(c, noise_level):
if random.random() >= noise_level:
return c
elif c == ' ':
@ -37,6 +37,17 @@ def add_noise(c, noise_level):
return c.lower()
def add_noise(orig, noise_level):
    """Randomly corrupt a training input, which may be a string or a word list.

    A single random draw first decides whether the input is touched at all:
    when ``random.random() >= noise_level`` the input is returned unchanged
    (the very same object). Otherwise every element is passed through
    ``_corrupt`` with the same ``noise_level``.

    Args:
        orig: Either a list of words or a string of raw text.
        noise_level: Threshold compared against ``random.random()``; ``0``
            never corrupts anything.

    Returns:
        ``orig`` itself when no corruption is applied. For list input, a new
        list of the corrupted words with falsy results (e.g. empty strings)
        removed, so it may be shorter than the input. For any other input,
        a string built by joining the corrupted elements.
    """
    if random.random() >= noise_level:
        return orig
    # isinstance (rather than an exact type comparison) so that list
    # subclasses are treated as word lists instead of being joined into
    # one string by the fallback branch below.
    if isinstance(orig, list):
        noisy_words = (_corrupt(word, noise_level) for word in orig)
        # Drop words that were corrupted away to nothing.
        return [word for word in noisy_words if word]
    return ''.join(_corrupt(c, noise_level) for c in orig)
def score_model(scorer, nlp, raw_text, annot_tuples):
if raw_text is None:
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
@ -109,8 +120,10 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
continue
score_model(scorer, nlp, raw_text, annot_tuples)
if raw_text is None:
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
words = add_noise(annot_tuples[1], corruption_level)
tokens = nlp.tokenizer.tokens_from_list(words)
else:
raw_text = add_noise(raw_text, corruption_level)
tokens = nlp.tokenizer(raw_text)
nlp.tagger(tokens)
gold = GoldParse(tokens, annot_tuples, make_projective=True)