* Fix alignment in prepare_treebank

This commit is contained in:
Matthew Honnibal 2015-05-06 16:31:00 +02:00
parent 0605af6838
commit e0ef6b6992

View File

@ -16,6 +16,8 @@ doc: {
end: int, end: int,
label: string, label: string,
flabel: int}]}]} flabel: int}]}]}
Consumes output of spacy/munge/align_raw.py
""" """
import plac import plac
import json import json
@ -39,7 +41,7 @@ def _get_word_indices(raw_sent, word_idx, offset):
indices[word_idx] = offset + match.start() indices[word_idx] = offset + match.start()
word_idx += 1 word_idx += 1
offset += len(piece) offset += len(piece)
return indices, word_idx, offset return indices, word_idx, offset + 1
def format_doc(section, filename, raw_paras, ptb_loc, dep_loc): def format_doc(section, filename, raw_paras, ptb_loc, dep_loc):
@ -49,25 +51,27 @@ def format_doc(section, filename, raw_paras, ptb_loc, dep_loc):
assert len(ptb_sents) == len(dep_sents) assert len(ptb_sents) == len(dep_sents)
word_idx = 0 word_idx = 0
offset = 0
i = 0 i = 0
doc = {'id': 'wsj_%s%s' % (section, filename), 'paragraphs': []} doc = {'id': 'wsj_%s%s' % (section, filename), 'paragraphs': []}
for raw_sents in raw_paras: for raw_sents in raw_paras:
para = {'raw': ' '.join(sent.replace('<SEP>', '') for sent in raw_sents), para = {'raw': ' '.join(sent.replace('<SEP>', '') for sent in raw_sents),
'segmented': '<PARA>'.join(raw_sents), 'segmented': '<SENT>'.join(raw_sents),
'sents': [], 'sents': [],
'tokens': [], 'tokens': [],
'brackets': []} 'brackets': []}
offset = 0
for raw_sent in raw_sents: for raw_sent in raw_sents:
words = raw_sent.replace('<SEP>', ' ').split()
para['sents'].append(offset) para['sents'].append(offset)
_, brackets = read_ptb.parse(ptb_sents[i], strip_bad_periods=True) _, brackets = read_ptb.parse(ptb_sents[i], strip_bad_periods=True)
_, annot = read_conll.parse(dep_sents[i], strip_bad_periods=True) _, annot = read_conll.parse(dep_sents[i], strip_bad_periods=True)
indices, word_idx, offset = _get_word_indices(raw_sent, 0, offset) indices, word_idx, offset = _get_word_indices(raw_sent, 0, offset)
for j, token in enumerate(annot):
for token in annot: head = indices[token['head']] if token['head'] != -1 else -1
head = indices[token['head']]
try: try:
para['tokens'].append({'start': indices[token['id']], para['tokens'].append({
'start': indices[token['id']],
'orth': words[j],
'tag': token['tag'], 'tag': token['tag'],
'head': head, 'head': head,
'dep': token['dep']}) 'dep': token['dep']})