mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-03 21:24:11 +03:00
* Update input corruption method to work with lists as well as strings
This commit is contained in:
parent
33e70b167f
commit
362f87dc3a
|
@ -24,7 +24,7 @@ from spacy.gold import GoldParse
|
||||||
from spacy.scorer import Scorer
|
from spacy.scorer import Scorer
|
||||||
|
|
||||||
|
|
||||||
def add_noise(c, noise_level):
|
def _corrupt(c, noise_level):
|
||||||
if random.random() >= noise_level:
|
if random.random() >= noise_level:
|
||||||
return c
|
return c
|
||||||
elif c == ' ':
|
elif c == ' ':
|
||||||
|
@ -37,6 +37,17 @@ def add_noise(c, noise_level):
|
||||||
return c.lower()
|
return c.lower()
|
||||||
|
|
||||||
|
|
||||||
|
def add_noise(orig, noise_level):
|
||||||
|
if random.random() >= noise_level:
|
||||||
|
return orig
|
||||||
|
elif type(orig) == list:
|
||||||
|
corrupted = [_corrupt(word, noise_level) for word in orig]
|
||||||
|
corrupted = [w for w in corrupted if w]
|
||||||
|
return corrupted
|
||||||
|
else:
|
||||||
|
return ''.join(_corrupt(c, noise_level) for c in orig)
|
||||||
|
|
||||||
|
|
||||||
def score_model(scorer, nlp, raw_text, annot_tuples):
|
def score_model(scorer, nlp, raw_text, annot_tuples):
|
||||||
if raw_text is None:
|
if raw_text is None:
|
||||||
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
|
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
|
||||||
|
@ -109,8 +120,10 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
|
||||||
continue
|
continue
|
||||||
score_model(scorer, nlp, raw_text, annot_tuples)
|
score_model(scorer, nlp, raw_text, annot_tuples)
|
||||||
if raw_text is None:
|
if raw_text is None:
|
||||||
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
|
words = add_noise(annot_tuples[1], corruption_level)
|
||||||
|
tokens = nlp.tokenizer.tokens_from_list(words)
|
||||||
else:
|
else:
|
||||||
|
raw_text = add_noise(raw_text, corruption_level)
|
||||||
tokens = nlp.tokenizer(raw_text)
|
tokens = nlp.tokenizer(raw_text)
|
||||||
nlp.tagger(tokens)
|
nlp.tagger(tokens)
|
||||||
gold = GoldParse(tokens, annot_tuples, make_projective=True)
|
gold = GoldParse(tokens, annot_tuples, make_projective=True)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user