mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-27 02:16:32 +03:00
* Fix head misalignment in read_conll when periods are ignored
This commit is contained in:
parent
d2ac8d8007
commit
0605af6838
|
@ -10,20 +10,22 @@ def parse(sent_text, strip_bad_periods=False):
|
||||||
assert sent_text
|
assert sent_text
|
||||||
annot = []
|
annot = []
|
||||||
words = []
|
words = []
|
||||||
i = 0
|
id_map = {}
|
||||||
for line in sent_text.split('\n'):
|
for i, line in enumerate(sent_text.split('\n')):
|
||||||
word, tag, head, dep = line.split()
|
word, tag, head, dep = line.split()
|
||||||
|
id_map[i] = len(words)
|
||||||
if strip_bad_periods and words and _is_bad_period(words[-1], word):
|
if strip_bad_periods and words and _is_bad_period(words[-1], word):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
annot.append({
|
annot.append({
|
||||||
'id': i,
|
'id': len(words),
|
||||||
'word': word,
|
'word': word,
|
||||||
'tag': tag,
|
'tag': tag,
|
||||||
'head': int(head) - 1 if int(head) != 0 else i,
|
'head': int(head) - 1,
|
||||||
'dep': dep})
|
'dep': dep})
|
||||||
words.append(word)
|
words.append(word)
|
||||||
i += 1
|
for entry in annot:
|
||||||
|
entry['head'] = id_map.get(entry['head'], entry['head'])
|
||||||
return words, annot
|
return words, annot
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user