* Clean unused functions from spacy.syntax.conll

This commit is contained in:
Matthew Honnibal 2015-05-24 20:06:46 +02:00
parent 78487f3e66
commit efe7a7d7d6

View File

@ -32,69 +32,6 @@ def read_json_file(loc):
return paragraphs
def read_conll03_file(loc):
    """Read a CoNLL-2003 style file into a list of sentence tuples.

    The file is split into documents on the '-DOCSTART- -X- O O' marker and
    each document into sentences on blank lines. Every token line must have
    exactly four whitespace-separated columns: word, tag, chunk, IOB entity.

    Returns a list with one entry per sentence:
        (text, [words], (ids, words, tags, heads, labels, ents))
    where `text` is the words joined by single spaces, `ids` are the
    character offsets of each word within `text`, `heads` is filled with -1
    and `labels` with 'ROOT' (the format carries no dependency parse), and
    `ents` is the result of passing the IOB column through _iob_to_biluo.

    Raises ValueError if a token line does not have exactly four columns.
    """
    # Read everything up front so the handle is closed before parsing starts.
    with codecs.open(loc, 'r', 'utf8') as file_:
        text = file_.read().strip()
    sents = []
    for doc in text.split('-DOCSTART- -X- O O'):
        doc = doc.strip()
        if not doc:
            continue
        for sent_str in doc.split('\n\n'):
            sent_str = sent_str.strip()
            if not sent_str:
                # Runs of extra blank lines yield empty chunks; skip them
                # instead of failing on the column unpacking below.
                continue
            words = []
            tags = []
            iob_ents = []
            ids = []
            idx = 0
            for line in sent_str.split('\n'):
                # The chunk column is read but not used.
                word, tag, _, iob = line.split()
                if tag == '"':
                    tag = '``'
                if '|' in tag:
                    # Ambiguous tags such as 'A|B': keep the first option.
                    tag = tag.split('|')[0]
                words.append(word)
                tags.append(tag)
                iob_ents.append(iob)
                ids.append(idx)
                # +1 accounts for the single space used to join the words.
                idx += len(word) + 1
            heads = [-1] * len(words)
            labels = ['ROOT'] * len(words)
            sents.append((' '.join(words), [words],
                          (ids, words, tags, heads, labels,
                           _iob_to_biluo(iob_ents))))
    return sents
def read_docparse_file(loc):
    """Read a "docparse" file into a list of sentence records.

    The file holds blank-line separated chunks. In each chunk the first line
    is the raw text, the second line is the tokenized text (sentences
    separated by '<SENT>', tokens by '<SEP>' or a space), and every further
    line is a token row parsed by _parse_line.

    Returns a list with one entry per chunk:
        (raw_text, tokenized, (ids, words, tags, heads, labels, iob_ents), [])
    where `tokenized` is a list of token lists, one per '<SENT>' segment.
    A 'root' label is normalised to 'ROOT', and a negative head index is
    replaced by the token's own id.

    Raises IndexError if a non-empty chunk has fewer than two lines.
    """
    # Read everything up front so the handle is closed before parsing starts.
    with codecs.open(loc, 'r', 'utf8') as file_:
        text = file_.read().strip()
    sents = []
    for sent_str in text.split('\n\n'):
        sent_str = sent_str.strip()
        if not sent_str:
            # Empty file or extra blank lines: nothing to parse here.
            continue
        words = []
        heads = []
        labels = []
        tags = []
        ids = []
        iob_ents = []
        lines = sent_str.split('\n')
        raw_text = lines.pop(0).strip()
        tok_text = lines.pop(0).strip()
        for line in lines:
            id_, word, pos_string, head_idx, label, iob_ent = _parse_line(line)
            if label == 'root':
                label = 'ROOT'
            words.append(word)
            if head_idx < 0:
                # A negative head is rewritten to point at the token itself.
                head_idx = id_
            ids.append(id_)
            heads.append(head_idx)
            labels.append(label)
            tags.append(pos_string)
            iob_ents.append(iob_ent)
        tokenized = [s.replace('<SEP>', ' ').split(' ')
                     for s in tok_text.split('<SENT>')]
        tuples = (ids, words, tags, heads, labels, iob_ents)
        sents.append((raw_text, tokenized, tuples, []))
    return sents
def _iob_to_biluo(tags):
out = []
curr_label = None
@ -128,20 +65,6 @@ def _consume_ent(tags):
return [start] + middle + [end]
def _parse_line(line):
pieces = line.split()
if len(pieces) == 4:
return 0, pieces[0], pieces[1], int(pieces[2]) - 1, pieces[3]
else:
id_ = int(pieces[0])
word = pieces[1]
pos = pieces[3]
iob_ent = pieces[5]
head_idx = int(pieces[6])
label = pieces[7]
return id_, word, pos, head_idx, label, iob_ent
cdef class GoldParse:
def __init__(self, tokens, annot_tuples, brackets=tuple()):
self.mem = Pool()