* Tmp commit

This commit is contained in:
Matthew Honnibal 2015-05-24 02:50:14 +02:00
parent 20f1d868a3
commit bfeb29ebd1
3 changed files with 43 additions and 41 deletions

View File

@ -26,8 +26,21 @@ from spacy.syntax.conll import GoldParse
from spacy.scorer import Scorer from spacy.scorer import Scorer
def add_noise(c, noise_level):
    """Randomly corrupt a single character of text.

    One random draw decides whether the character is touched at all. If the
    draw is at or above ``noise_level`` the character comes back unchanged;
    otherwise it is corrupted according to these rules:

    * a space becomes a newline,
    * a newline becomes a space,
    * ``.``, ``'``, ``!`` and ``?`` are deleted (an empty string is returned),
    * anything else is lower-cased.

    Args:
        c: The character (a string) to consider for corruption.
        noise_level: Threshold compared against ``random.random()``. ``0``
            never corrupts; ``1.0`` or more always corrupts.

    Returns:
        ``c`` itself, or its corrupted replacement (possibly ``''``).
    """
    # random.random() yields a float in [0.0, 1.0), so this guard always
    # passes for noise_level <= 0 and never passes for noise_level >= 1.
    if random.random() >= noise_level:
        return c
    if c == ' ':
        return '\n'
    if c == '\n':
        return ' '
    if c in ('.', "'", '!', '?'):
        return ''
    return c.lower()
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0, def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0,
gold_preproc=False, n_sents=0): gold_preproc=False, n_sents=0, corruption_level=0):
dep_model_dir = path.join(model_dir, 'deps') dep_model_dir = path.join(model_dir, 'deps')
pos_model_dir = path.join(model_dir, 'pos') pos_model_dir = path.join(model_dir, 'pos')
ner_model_dir = path.join(model_dir, 'ner') ner_model_dir = path.join(model_dir, 'ner')
@ -55,15 +68,13 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0
print "Itn.\tUAS\tNER F.\tTag %\tToken %" print "Itn.\tUAS\tNER F.\tTag %\tToken %"
for itn in range(n_iter): for itn in range(n_iter):
scorer = Scorer() scorer = Scorer()
for raw_text, segmented_text, annot_tuples, ctnt in gold_tuples: for raw_text, annot_tuples, ctnt in gold_tuples:
raw_text = ''.join(add_noise(c, corruption_level) for c in raw_text)
tokens = nlp(raw_text, merge_mwes=False) tokens = nlp(raw_text, merge_mwes=False)
gold = GoldParse(tokens, annot_tuples) gold = GoldParse(tokens, annot_tuples)
scorer.score(tokens, gold, verbose=False) scorer.score(tokens, gold, verbose=False)
assert not gold_preproc
if gold_preproc: sents = [nlp.tokenizer(raw_text)]
sents = [nlp.tokenizer.tokens_from_list(s) for s in segmented_text]
else:
sents = [nlp.tokenizer(raw_text)]
for tokens in sents: for tokens in sents:
gold = GoldParse(tokens, annot_tuples) gold = GoldParse(tokens, annot_tuples)
nlp.tagger(tokens) nlp.tagger(tokens)
@ -90,7 +101,7 @@ def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=True)
assert not gold_preproc assert not gold_preproc
nlp = Language(data_dir=model_dir) nlp = Language(data_dir=model_dir)
scorer = Scorer() scorer = Scorer()
for raw_text, segmented_text, annot_tuples, brackets in gold_tuples: for raw_text, annot_tuples, brackets in gold_tuples:
tokens = nlp(raw_text, merge_mwes=False) tokens = nlp(raw_text, merge_mwes=False)
gold = GoldParse(tokens, annot_tuples) gold = GoldParse(tokens, annot_tuples)
scorer.score(tokens, gold, verbose=verbose) scorer.score(tokens, gold, verbose=verbose)
@ -111,7 +122,7 @@ def write_parses(Language, dev_loc, model_dir, out_loc):
return scorer return scorer
def get_sents(json_dir, section): def get_sents(json_loc):
if path.exists(path.join(json_dir, section + '.json')): if path.exists(path.join(json_dir, section + '.json')):
for sent in read_json_file(path.join(json_dir, section + '.json')): for sent in read_json_file(path.join(json_dir, section + '.json')):
yield sent yield sent
@ -131,21 +142,24 @@ def get_sents(json_dir, section):
@plac.annotations( @plac.annotations(
json_dir=("Annotated JSON files directory",), train_loc=("Location of training json file"),
dev_loc=("Location of development json file"),
corruption_level=("Amount of noise to add to training data", "option", "c", float),
model_dir=("Location of output model directory",), model_dir=("Location of output model directory",),
out_loc=("Out location", "option", "o", str), out_loc=("Out location", "option", "o", str),
n_sents=("Number of training sentences", "option", "n", int), n_sents=("Number of training sentences", "option", "n", int),
verbose=("Verbose error reporting", "flag", "v", bool), verbose=("Verbose error reporting", "flag", "v", bool),
debug=("Debug mode", "flag", "d", bool) debug=("Debug mode", "flag", "d", bool)
) )
def main(json_dir, model_dir, n_sents=0, out_loc="", verbose=False, def main(train_loc, dev_loc, model_dir, n_sents=0, out_loc="", verbose=False,
debug=False): debug=False, corruption_level=0.0):
train(English, list(get_sents(json_dir, 'train')), model_dir, train(English, read_json_file(train_loc), model_dir,
feat_set='basic' if not debug else 'debug', feat_set='basic' if not debug else 'debug',
gold_preproc=False, n_sents=n_sents) gold_preproc=False, n_sents=n_sents,
corruption_level=corruption_level)
if out_loc: if out_loc:
write_parses(English, dev_loc, model_dir, out_loc) write_parses(English, dev_loc, model_dir, out_loc)
scorer = evaluate(English, list(get_sents(json_dir, 'dev')), scorer = evaluate(English, read_json_file(dev_loc),
model_dir, gold_preproc=False, verbose=verbose) model_dir, gold_preproc=False, verbose=verbose)
print 'TOK', 100-scorer.token_acc print 'TOK', 100-scorer.token_acc
print 'POS', scorer.tags_acc print 'POS', scorer.tags_acc

View File

@ -34,44 +34,30 @@ def _iter_raw_files(raw_loc):
yield f yield f
def _get_word_indices(raw_sent, word_idx, offset):
indices = {}
for piece in raw_sent.split('<SEP>'):
for match in re.finditer(r'\S+', piece):
indices[word_idx] = offset + match.start()
word_idx += 1
offset += len(piece)
return indices, word_idx, offset + 1
def format_doc(section, filename, raw_paras, ptb_loc, dep_loc): def format_doc(section, filename, raw_paras, ptb_loc, dep_loc):
ptb_sents = read_ptb.split(open(ptb_loc).read()) ptb_sents = read_ptb.split(open(ptb_loc).read())
dep_sents = read_conll.split(open(dep_loc).read()) dep_sents = read_conll.split(open(dep_loc).read())
assert len(ptb_sents) == len(dep_sents) assert len(ptb_sents) == len(dep_sents)
word_idx = 0
i = 0 i = 0
doc = {'id': filename, 'paragraphs': []} doc = {'id': filename, 'paragraphs': []}
for raw_sents in raw_paras: for raw_sents in raw_paras:
para = {'raw': ' '.join(sent.replace('<SEP>', '') for sent in raw_sents), para = {
'segmented': '<SENT>'.join(raw_sents), 'raw': ' '.join(sent.replace('<SEP>', '') for sent in raw_sents),
'sents': [], 'sents': [],
'tokens': [], 'tokens': [],
'brackets': []} 'brackets': []}
offset = 0 offset = 0
for raw_sent in raw_sents: for raw_sent in raw_sents:
words = raw_sent.replace('<SEP>', ' ').split()
para['sents'].append(offset)
_, brackets = read_ptb.parse(ptb_sents[i], strip_bad_periods=True) _, brackets = read_ptb.parse(ptb_sents[i], strip_bad_periods=True)
_, annot = read_conll.parse(dep_sents[i], strip_bad_periods=True) _, annot = read_conll.parse(dep_sents[i], strip_bad_periods=True)
indices, word_idx, offset = _get_word_indices(raw_sent, 0, offset) for token_id, token in enumerate(annot):
for j, token in enumerate(annot):
try: try:
head = indices[token['head']] if token['head'] != -1 else -1 head = (token['head'] + offset) if token['head'] != -1 else -1
para['tokens'].append({ para['tokens'].append({
'start': indices[token['id']], 'id': offset + token_id,
'orth': words[j], 'orth': token['word'],
'tag': token['tag'], 'tag': token['tag'],
'head': head, 'head': head,
'dep': token['dep']}) 'dep': token['dep']})
@ -80,9 +66,11 @@ def format_doc(section, filename, raw_paras, ptb_loc, dep_loc):
for label, start, end in brackets: for label, start, end in brackets:
if start != end: if start != end:
para['brackets'].append({'label': label, para['brackets'].append({'label': label,
'start': indices[start], 'start': start + offset,
'end': indices[end-1]}) 'end': (end-1) + offset})
i += 1 i += 1
offset += len(annot)
para['sents'].append(offset)
doc['paragraphs'].append(para) doc['paragraphs'].append(para)
return doc return doc

View File

@ -147,7 +147,7 @@ def main(modules, is_pypy):
MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings', MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
'spacy.lexeme', 'spacy.vocab', 'spacy.tokens', 'spacy.spans', 'spacy.lexeme', 'spacy.vocab', 'spacy.tokens', 'spacy.spans',
'spacy.morphology', 'spacy.morphology', 'spacy.munge.alignment',
'spacy._ml', 'spacy.tokenizer', 'spacy.en.attrs', 'spacy._ml', 'spacy.tokenizer', 'spacy.en.attrs',
'spacy.en.pos', 'spacy.syntax.parser', 'spacy.syntax._state', 'spacy.en.pos', 'spacy.syntax.parser', 'spacy.syntax._state',
'spacy.syntax.transition_system', 'spacy.syntax.transition_system',