mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 01:16:28 +03:00
* Add read_conll03_file function to conll.pyx
This commit is contained in:
parent
99c9ecfc18
commit
435cccf098
|
@ -4,6 +4,38 @@ import codecs
|
||||||
from libc.string cimport memset
|
from libc.string cimport memset
|
||||||
|
|
||||||
|
|
||||||
|
def read_conll03_file(loc):
    """Read a CoNLL-2003 style file into a list of sentence annotations.

    The text is split into documents on the '-DOCSTART- -X- O O' marker and
    into sentences on blank lines. Every token line must hold exactly four
    whitespace-separated fields: word, tag, chunk and IOB entity tag. The
    chunk field is read but not used.

    Args:
        loc: Path of the UTF-8 encoded file to read.

    Returns:
        A list with one tuple per sentence, shaped as
        (raw_text, [words], (ids, words, tags, heads, labels, biluo_ents)).
        raw_text is the words joined by single spaces, ids holds the
        character offset of each word inside raw_text, heads is all -1,
        labels is all 'ROOT', and biluo_ents is the IOB column passed
        through _iob_to_biluo.

    Raises:
        ValueError: If a token line does not have exactly four fields.
    """
    # Read inside a context manager so the file handle is always closed.
    with codecs.open(loc, 'r', 'utf8') as file_:
        text = file_.read().strip()
    sents = []
    for doc in text.split('-DOCSTART- -X- O O'):
        doc = doc.strip()
        if not doc:
            continue
        for sent_str in doc.split('\n\n'):
            sent_str = sent_str.strip()
            if not sent_str:
                # Runs of blank lines leave empty chunks that hold no tokens.
                continue
            words = []
            tags = []
            iob_ents = []
            ids = []
            idx = 0
            for line in sent_str.split('\n'):
                word, tag, chunk, iob = line.split()
                # A bare double quote is mapped to the opening-quote tag.
                if tag == '"':
                    tag = '``'
                # For ambiguous tags such as 'NN|JJ' keep the first option.
                if '|' in tag:
                    tag = tag.split('|')[0]
                words.append(word)
                tags.append(tag)
                iob_ents.append(iob)
                ids.append(idx)
                # +1 accounts for the single space used to join the words.
                idx += len(word) + 1
            # No parse is available: every token gets a placeholder head
            # and the 'ROOT' label.
            heads = [-1] * len(words)
            labels = ['ROOT'] * len(words)
            sents.append((' '.join(words), [words],
                          (ids, words, tags, heads, labels,
                           _iob_to_biluo(iob_ents))))
    return sents
|
||||||
|
|
||||||
|
|
||||||
def read_docparse_file(loc):
|
def read_docparse_file(loc):
|
||||||
sents = []
|
sents = []
|
||||||
for sent_str in codecs.open(loc, 'r', 'utf8').read().strip().split('\n\n'):
|
for sent_str in codecs.open(loc, 'r', 'utf8').read().strip().split('\n\n'):
|
||||||
|
@ -33,6 +65,40 @@ def read_docparse_file(loc):
|
||||||
sents.append((raw_text, tokenized, (ids, words, tags, heads, labels, iob_ents)))
|
sents.append((raw_text, tokenized, (ids, words, tags, heads, labels, iob_ents)))
|
||||||
return sents
|
return sents
|
||||||
|
|
||||||
|
|
||||||
|
def _iob_to_biluo(tags):
    """Convert a sequence of IOB entity tags into BILUO tags.

    The input is copied first, so the caller's sequence is left untouched.
    The copy is then drained from the front: each pass emits the leading
    run of 'O' tags followed by the tags of the next entity.

    Args:
        tags: Iterable of IOB tag strings for one sentence.

    Returns:
        A list of tag strings produced by _consume_os and _consume_ent.
    """
    out = []
    # Work on a private copy because the helpers pop from the list in place.
    remaining = list(tags)
    while remaining:
        out.extend(_consume_os(remaining))
        out.extend(_consume_ent(remaining))
    return out
|
||||||
|
|
||||||
|
|
||||||
|
def _consume_os(tags):
    """Pop and yield the run of 'O' tags at the front of *tags*.

    This is a generator: each tag is removed from the list only when the
    corresponding item is requested. Iteration ends at the first tag that
    is not 'O', or when the list is empty.
    """
    while True:
        if not tags or tags[0] != 'O':
            return
        outside = tags.pop(0)
        yield outside
|
||||||
|
|
||||||
|
|
||||||
|
def _consume_ent(tags):
    """Pop one entity off the front of *tags* and return its BILUO tags.

    The first tag is removed and, if it starts with 'B', its prefix is
    rewritten to 'I'. Every immediately following tag equal to that
    'I-<label>' form is removed as part of the same entity.

    Args:
        tags: List of IOB tag strings; it is modified in place.

    Returns:
        [] if *tags* is empty, ['U-<label>'] for a single-token entity,
        otherwise ['B-<label>', 'I-<label>', ..., 'L-<label>'] with one
        tag per consumed token.
    """
    if not tags:
        return []
    first = tags.pop(0)
    # Rewrite only the leading marker. A blanket replace of 'B' would also
    # corrupt labels that contain the letter (e.g. 'B-BRAND').
    if first.startswith('B'):
        target = 'I' + first[1:]
    else:
        target = first
    # Count the continuation tags, then drop them with one slice deletion
    # instead of popping from the front once per token.
    extra = 0
    while extra < len(tags) and tags[extra] == target:
        extra += 1
    del tags[:extra]
    length = extra + 1
    # Skip the two-character 'X-' prefix to get the entity label.
    label = target[2:]
    if length == 1:
        return ['U-' + label]
    middle = ['I-' + label] * (length - 2)
    return ['B-' + label] + middle + ['L-' + label]
|
||||||
|
|
||||||
|
|
||||||
def _parse_line(line):
|
def _parse_line(line):
|
||||||
pieces = line.split()
|
pieces = line.split()
|
||||||
if len(pieces) == 4:
|
if len(pieces) == 4:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user