spaCy/spacy/munge/read_conll.py

from __future__ import unicode_literals


def split(text):
    """Cut *text* into chunks at every double newline.

    Each chunk is stripped of surrounding whitespace; chunks that are empty
    after stripping are left out of the returned list.
    """
    trimmed = (chunk.strip() for chunk in text.split('\n\n'))
    return [chunk for chunk in trimmed if chunk]


def parse(sent_text, strip_bad_periods=False):
    """Parse one sentence block into its words and per-token annotations.

    Every line of *sent_text* must hold exactly four whitespace-separated
    fields: word, tag, head and dep (anything else makes the unpacking raise
    ValueError).  The head field is read as an integer and reduced by one
    before it is stored.

    With *strip_bad_periods* set, a token is dropped when ``_is_bad_period``
    accepts it together with the previously kept word.  Afterwards, every
    stored head that equals an input line number is rewritten to the
    position that line received in the returned word list; any other head
    value is left as it is.

    Returns a ``(words, annot)`` tuple: the kept word strings, and one dict
    per kept token with the keys 'id', 'word', 'tag', 'head' and 'dep'.
    """
    body = sent_text.strip()
    assert body
    tokens = []
    entries = []
    # Input line number -> position in ``tokens``.
    line_to_slot = {}
    for line_no, row in enumerate(body.split('\n')):
        form, pos_tag, governor, label = row.split()
        # Recorded before the drop check, so a dropped line maps to the slot
        # taken by whichever token is kept next.
        # NOTE(review): if nothing is kept afterwards this slot is one past
        # the end of ``tokens`` -- confirm no head ever points at a dropped
        # period.
        line_to_slot[line_no] = len(tokens)
        dropped = (strip_bad_periods and tokens
                   and _is_bad_period(tokens[-1], form))
        if not dropped:
            entries.append({
                'id': len(tokens),
                'word': form,
                'tag': pos_tag,
                'head': int(governor) - 1,
                'dep': label})
            tokens.append(form)
    # Second pass: translate heads from input line numbers to kept positions.
    for record in entries:
        old_head = record['head']
        if old_head in line_to_slot:
            record['head'] = line_to_slot[old_head]
    return tokens, entries


def _is_bad_period(prev, period):
    if period != '.':
        return False
    elif prev == '.':
        return False
    elif not prev.endswith('.'):
        return False
    else:
        return True
* Write JSON files, with both dependency and PSG parses 2015-05-05 03:31:20 +03:00			`from __future__ import unicode_literals`


			`def split(text):`
			`return [sent.strip() for sent in text.split('\n\n') if sent.strip()]`


			`def parse(sent_text, strip_bad_periods=False):`
			`sent_text = sent_text.strip()`
			`assert sent_text`
			`annot = []`
			`words = []`
* Fix head misalignment in read_conll, when periods are ignored 2015-05-06 17:30:28 +03:00			`id_map = {}`
			`for i, line in enumerate(sent_text.split('\n')):`
* Write JSON files, with both dependency and PSG parses 2015-05-05 03:31:20 +03:00			`word, tag, head, dep = line.split()`
* Fix head misalignment in read_conll, when periods are ignored 2015-05-06 17:30:28 +03:00			`id_map[i] = len(words)`
* Write JSON files, with both dependency and PSG parses 2015-05-05 03:31:20 +03:00			`if strip_bad_periods and words and _is_bad_period(words[-1], word):`
			`continue`

			`annot.append({`
* Fix head misalignment in read_conll, when periods are ignored 2015-05-06 17:30:28 +03:00			`'id': len(words),`
* Write JSON files, with both dependency and PSG parses 2015-05-05 03:31:20 +03:00			`'word': word,`
			`'tag': tag,`
* Fix head misalignment in read_conll, when periods are ignored 2015-05-06 17:30:28 +03:00			`'head': int(head) - 1,`
* Write JSON files, with both dependency and PSG parses 2015-05-05 03:31:20 +03:00			`'dep': dep})`
			`words.append(word)`
* Fix head misalignment in read_conll, when periods are ignored 2015-05-06 17:30:28 +03:00			`for entry in annot:`
			`entry['head'] = id_map.get(entry['head'], entry['head'])`
* Write JSON files, with both dependency and PSG parses 2015-05-05 03:31:20 +03:00			`return words, annot`


			`def _is_bad_period(prev, period):`
			`if period != '.':`
			`return False`
			`elif prev == '.':`
			`return False`
			`elif not prev.endswith('.'):`
			`return False`
			`else:`
			`return True`