Mirror of https://github.com/explosion/spaCy.git, synced 2025-07-10 16:22:29 +03:00
* Add read_json_file to conll.pyx
parent aff9359a8d
commit ab67693393
conll.pyx
@@ -1,9 +1,38 @@
 import numpy
 import codecs
+import json
 
 from libc.string cimport memset
 
 
+def read_json_file(loc):
+    paragraphs = []
+    for doc in json.load(open(loc)):
+        for paragraph in doc['paragraphs']:
+            words = []
+            ids = []
+            tags = []
+            heads = []
+            labels = []
+            iob_ents = []
+            for token in paragraph['tokens']:
+                words.append(token['orth'])
+                ids.append(token['start'])
+                tags.append(token['tag'])
+                heads.append(token['head'] if token['head'] >= 1 else token['start'])
+                labels.append(token['dep'])
+                iob_ents.append(token.get('iob_ent', 'O'))
+
+            brackets = []
+            tokenized = [s.replace('<SEP>', ' ').split(' ')
+                         for s in paragraph['segmented'].split('<SENT>')]
+            paragraphs.append((paragraph['raw'],
+                               tokenized,
+                               (ids, words, tags, heads, labels, _iob_to_biluo(iob_ents)),
+                               brackets))
+    return paragraphs
+
+
 def read_conll03_file(loc):
     sents = []
     text = codecs.open(loc, 'r', 'utf8').read().strip()
@@ -62,7 +91,8 @@ def read_docparse_file(loc):
         iob_ents.append(iob_ent)
     tokenized = [s.replace('<SEP>', ' ').split(' ')
                  for s in tok_text.split('<SENT>')]
-    sents.append((raw_text, tokenized, (ids, words, tags, heads, labels, iob_ents)))
+    tuples = (ids, words, tags, heads, labels, iob_ents)
+    sents.append((raw_text, tokenized, tuples, []))
     return sents
 
 
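For orientation, the snippet below sketches the JSON layout that read_json_file appears to expect, inferred from the keys it accesses ('paragraphs', 'raw', 'segmented', 'tokens', 'orth', 'start', 'tag', 'head', 'dep', 'iob_ent'). The sentence, offsets, and tags are invented for illustration, and 'example.json' is a hypothetical file name.

import json

example = [
    {
        "paragraphs": [
            {
                "raw": "Hello world.",
                "segmented": "Hello<SEP>world<SEP>.",
                "tokens": [
                    # 'head' appears to hold the start offset of the head token;
                    # a value below 1 marks the root, in which case read_json_file
                    # falls back to the token's own 'start'.
                    {"start": 0,  "orth": "Hello", "tag": "UH", "head": 6, "dep": "intj",  "iob_ent": "O"},
                    {"start": 6,  "orth": "world", "tag": "NN", "head": 0, "dep": "ROOT",  "iob_ent": "O"},
                    {"start": 11, "orth": ".",     "tag": ".",  "head": 6, "dep": "punct", "iob_ent": "O"},
                ],
            }
        ]
    }
]

with open('example.json', 'w') as file_:
    json.dump(example, file_)

# read_json_file('example.json') would then return one entry of the form
# (raw, tokenized, (ids, words, tags, heads, labels, biluo_ents), brackets).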
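The new reader also passes the collected IOB entity tags through _iob_to_biluo before storing them. That helper is not shown in this diff; purely as a rough illustration of the tag scheme it refers to (assuming standard IOB input such as 'B-ORG', 'I-ORG', 'O'), a stand-alone conversion might look like the sketch below. This is not the implementation in conll.pyx.

def iob_to_biluo_sketch(tags):
    # Illustrative only: map IOB tags onto the BILUO scheme, where a
    # single-token entity becomes 'U-...' and the last token of a longer
    # entity becomes 'L-...'.
    out = []
    i = 0
    while i < len(tags):
        if tags[i] == 'O':
            out.append('O')
            i += 1
            continue
        label = tags[i].split('-', 1)[1]
        j = i + 1
        while j < len(tags) and tags[j] == 'I-' + label:
            j += 1
        if j - i == 1:
            out.append('U-' + label)
        else:
            out.append('B-' + label)
            out.extend('I-' + label for _ in range(j - i - 2))
            out.append('L-' + label)
        i = j
    return out

# iob_to_biluo_sketch(['B-ORG', 'I-ORG', 'O', 'B-PER'])
# -> ['B-ORG', 'L-ORG', 'O', 'U-PER']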