2015-05-05 03:31:20 +03:00
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
|
|
|
|
|
|
|
def split(text):
    """Cut *text* into chunks separated by a blank line.

    The text is divided at every pair of consecutive newline characters.
    Each piece has its surrounding whitespace removed, and pieces that are
    empty after that are left out of the returned list.
    """
    chunks = []
    for piece in text.split('\n\n'):
        cleaned = piece.strip()
        if cleaned:
            chunks.append(cleaned)
    return chunks
|
|
|
|
|
|
|
|
|
|
|
|
def parse(sent_text, strip_bad_periods=False):
    """Read a sentence given as one token per line into words and annotations.

    Every line is passed to ``_parse_line`` to obtain its word, tag, head
    and dependency label. The head field is converted with ``int`` and
    reduced by one, so it refers to a line number counted from zero.

    When ``strip_bad_periods`` is true, a token for which
    ``_is_bad_period(previous_kept_word, word)`` holds is skipped. Heads of
    the kept tokens are then translated from input line numbers to
    positions in the returned word list.

    Returns a ``(words, annot)`` pair: the kept word strings, and a
    parallel list of dicts with the keys ``'id'``, ``'word'``, ``'tag'``,
    ``'head'`` and ``'dep'``. A head of ``-1`` stays ``-1``. A head that
    names a line with no entry in the line-to-position map (for example a
    skipped line) raises ``KeyError``.
    """
    sent_text = sent_text.strip()
    assert sent_text
    tokens = []
    entries = []
    # Input line number -> position in ``tokens``; -1 maps to itself.
    line_to_pos = {-1: -1}
    for line_no, raw_line in enumerate(sent_text.split('\n')):
        word, tag, head, dep = _parse_line(raw_line)
        if strip_bad_periods and tokens:
            if _is_bad_period(tokens[-1], word):
                continue
        position = len(tokens)
        line_to_pos[line_no] = position
        entry = {
            'id': position,
            'word': word,
            'tag': tag,
            'head': int(head) - 1,
            'dep': dep,
        }
        entries.append(entry)
        tokens.append(word)
    # Heads were stored as input line numbers; rewrite them only now, when
    # every kept line has its final position recorded.
    for entry in entries:
        entry['head'] = line_to_pos[entry['head']]
    return tokens, entries
|
|
|
|
|
|
|
|
|
|
|
|
def _is_bad_period(prev, period):
|
|
|
|
if period != '.':
|
|
|
|
return False
|
|
|
|
elif prev == '.':
|
|
|
|
return False
|
|
|
|
elif not prev.endswith('.'):
|
|
|
|
return False
|
|
|
|
else:
|
|
|
|
return True
|
|
|
|
|
|
|
|
|
2015-05-24 03:49:56 +03:00
|
|
|
def _parse_line(line):
|
|
|
|
pieces = line.split()
|
|
|
|
if len(pieces) == 4:
|
|
|
|
return pieces
|
|
|
|
else:
|
|
|
|
return pieces[1], pieces[3], pieces[5], pieces[6]
|
|
|
|
|