* Fix head misalignment in read_conll when periods are ignored

This commit is contained in:
Matthew Honnibal 2015-05-06 16:30:28 +02:00
parent d2ac8d8007
commit 0605af6838

View File

@@ -10,20 +10,22 @@ def parse(sent_text, strip_bad_periods=False):
assert sent_text assert sent_text
annot = [] annot = []
words = [] words = []
i = 0 id_map = {}
for line in sent_text.split('\n'): for i, line in enumerate(sent_text.split('\n')):
word, tag, head, dep = line.split() word, tag, head, dep = line.split()
id_map[i] = len(words)
if strip_bad_periods and words and _is_bad_period(words[-1], word): if strip_bad_periods and words and _is_bad_period(words[-1], word):
continue continue
annot.append({ annot.append({
'id': i, 'id': len(words),
'word': word, 'word': word,
'tag': tag, 'tag': tag,
'head': int(head) - 1 if int(head) != 0 else i, 'head': int(head) - 1,
'dep': dep}) 'dep': dep})
words.append(word) words.append(word)
i += 1 for entry in annot:
entry['head'] = id_map.get(entry['head'], entry['head'])
return words, annot return words, annot