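* Fix head-index alignment in read_conll.parse. When strip_bad_periods=True, skipped period tokens shifted the ids of later words but the head indices were not remapped to match, which was causing corrupt parses. Heads are now remapped through an id_map after the sentence is read. A similar problem may apply to other data readers.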

This commit is contained in:
Matthew Honnibal 2015-06-18 16:35:27 +02:00
parent f868175e43
commit 60d26243e3

View File

@@ -10,11 +10,12 @@ def parse(sent_text, strip_bad_periods=False):
assert sent_text assert sent_text
annot = [] annot = []
words = [] words = []
id_map = {} id_map = {-1: -1}
for i, line in enumerate(sent_text.split('\n')): for i, line in enumerate(sent_text.split('\n')):
word, tag, head, dep = _parse_line(line) word, tag, head, dep = _parse_line(line)
if strip_bad_periods and words and _is_bad_period(words[-1], word): if strip_bad_periods and words and _is_bad_period(words[-1], word):
continue continue
id_map[i] = len(words)
annot.append({ annot.append({
'id': len(words), 'id': len(words),
@@ -23,6 +24,8 @@ def parse(sent_text, strip_bad_periods=False):
'head': int(head) - 1, 'head': int(head) - 1,
'dep': dep}) 'dep': dep})
words.append(word) words.append(word)
for entry in annot:
entry['head'] = id_map[entry['head']]
return words, annot return words, annot