* Make very hacky modifications to the parser training script to get Chinese up and running.

This commit is contained in:
Matthew Honnibal 2016-04-28 14:30:24 +02:00
parent b1cf2c16c3
commit 588026fe93

View File

@ -41,17 +41,21 @@ def _corrupt(c, noise_level):
def add_noise(orig, noise_level): def add_noise(orig, noise_level):
if random.random() >= noise_level: # TODO
return orig return orig.replace(' ', '')
elif type(orig) == list: #if random.random() >= noise_level:
corrupted = [_corrupt(word, noise_level) for word in orig] # return orig
corrupted = [w for w in corrupted if w] #elif type(orig) == list:
return corrupted # corrupted = [_corrupt(word, noise_level) for word in orig]
else: # corrupted = [w for w in corrupted if w]
return ''.join(_corrupt(c, noise_level) for c in orig) # return corrupted
#else:
# return ''.join(_corrupt(c, noise_level) for c in orig)
def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False): def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
# TODO
raw_text = raw_text.replace(' ', '')
if raw_text is None: if raw_text is None:
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
else: else:
@ -139,7 +143,10 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
nlp.tagger(tokens) nlp.tagger(tokens)
gold = GoldParse(tokens, annot_tuples) gold = GoldParse(tokens, annot_tuples)
if not gold.is_projective: if not gold.is_projective:
raise Exception("Non-projective sentence in training: %s" % annot_tuples[1]) # TODO
nlp.tagger.train(tokens, gold.tags)
continue
#raise Exception("Non-projective sentence in training: %s" % annot_tuples[1])
loss += nlp.parser.train(tokens, gold) loss += nlp.parser.train(tokens, gold)
nlp.entity.train(tokens, gold) nlp.entity.train(tokens, gold)
nlp.tagger.train(tokens, gold.tags) nlp.tagger.train(tokens, gold.tags)