mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 01:46:28 +03:00
* Add read_json_file to conll.pyx
This commit is contained in:
parent
aff9359a8d
commit
ab67693393
|
@ -1,9 +1,38 @@
|
|||
import numpy
|
||||
import codecs
|
||||
import json
|
||||
|
||||
from libc.string cimport memset
|
||||
|
||||
|
||||
def read_json_file(loc):
    """Read a JSON training file into a list of paragraph tuples.

    The file at `loc` must contain a JSON list of documents. Each document
    has a 'paragraphs' list; each paragraph has 'raw', 'segmented' and
    'tokens' entries; each token has 'orth', 'start', 'tag', 'head' and 'dep'
    entries, plus an optional 'iob_ent' entry that defaults to 'O'.

    Returns a list with one tuple per paragraph:

        (raw, tokenized, (ids, words, tags, heads, labels, ents), brackets)

    where `tokenized` is a list of sentences (each a list of strings obtained
    by splitting the 'segmented' text on '<SENT>' and then on '<SEP>' or a
    space), `ents` is the result of passing the per-token IOB tags through
    `_iob_to_biluo`, and `brackets` is always an empty list.

    Raises KeyError if a required entry is missing from the JSON data.
    """
    # Read as UTF-8, like the other readers in this module, and close the
    # file handle as soon as the data is parsed instead of leaking it.
    with codecs.open(loc, 'r', 'utf8') as file_:
        docs = json.load(file_)

    paragraphs = []
    for doc in docs:
        for paragraph in doc['paragraphs']:
            words = []
            ids = []
            tags = []
            heads = []
            labels = []
            iob_ents = []
            for token in paragraph['tokens']:
                words.append(token['orth'])
                ids.append(token['start'])
                tags.append(token['tag'])
                # A head value below 1 falls back to the token's own 'start',
                # i.e. the token is recorded as its own head.
                # NOTE(review): presumably this marks the root -- confirm
                # that a head of 0 can never be a legitimate value.
                if token['head'] >= 1:
                    heads.append(token['head'])
                else:
                    heads.append(token['start'])
                labels.append(token['dep'])
                iob_ents.append(token.get('iob_ent', 'O'))

            brackets = []
            # '<SENT>' separates sentences; '<SEP>' separates tokens that are
            # not separated by a space in the segmented text.
            tokenized = [s.replace('<SEP>', ' ').split(' ')
                         for s in paragraph['segmented'].split('<SENT>')]
            tuples = (ids, words, tags, heads, labels,
                      _iob_to_biluo(iob_ents))
            paragraphs.append((paragraph['raw'], tokenized, tuples, brackets))
    return paragraphs
|
||||
|
||||
|
||||
def read_conll03_file(loc):
|
||||
sents = []
|
||||
text = codecs.open(loc, 'r', 'utf8').read().strip()
|
||||
|
@ -62,7 +91,8 @@ def read_docparse_file(loc):
|
|||
iob_ents.append(iob_ent)
|
||||
tokenized = [s.replace('<SEP>', ' ').split(' ')
|
||||
for s in tok_text.split('<SENT>')]
|
||||
sents.append((raw_text, tokenized, (ids, words, tags, heads, labels, iob_ents)))
|
||||
tuples = (ids, words, tags, heads, labels, iob_ents)
|
||||
sents.append((raw_text, tokenized, tuples, []))
|
||||
return sents
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user