Mirror of https://github.com/explosion/spaCy.git, synced 2025-07-10 16:22:29 +03:00
* Add read_json_file to conll.pyx
parent aff9359a8d
commit ab67693393
conll.pyx
@@ -1,9 +1,38 @@
 import numpy
 import codecs
+import json
 
 from libc.string cimport memset
 
 
+def read_json_file(loc):
+    paragraphs = []
+    for doc in json.load(open(loc)):
+        for paragraph in doc['paragraphs']:
+            words = []
+            ids = []
+            tags = []
+            heads = []
+            labels = []
+            iob_ents = []
+            for token in paragraph['tokens']:
+                words.append(token['orth'])
+                ids.append(token['start'])
+                tags.append(token['tag'])
+                heads.append(token['head'] if token['head'] >= 1 else token['start'])
+                labels.append(token['dep'])
+                iob_ents.append(token.get('iob_ent', 'O'))
+
+            brackets = []
+            tokenized = [s.replace('<SEP>', ' ').split(' ')
+                         for s in paragraph['segmented'].split('<SENT>')]
+            paragraphs.append((paragraph['raw'],
+                               tokenized,
+                               (ids, words, tags, heads, labels, _iob_to_biluo(iob_ents)),
+                               brackets))
+    return paragraphs
+
+
 def read_conll03_file(loc):
     sents = []
     text = codecs.open(loc, 'r', 'utf8').read().strip()
@@ -62,7 +91,8 @@ def read_docparse_file(loc):
         iob_ents.append(iob_ent)
     tokenized = [s.replace('<SEP>', ' ').split(' ')
                  for s in tok_text.split('<SENT>')]
-    sents.append((raw_text, tokenized, (ids, words, tags, heads, labels, iob_ents)))
+    tuples = (ids, words, tags, heads, labels, iob_ents)
+    sents.append((raw_text, tokenized, tuples, []))
     return sents
 
 
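For orientation, the snippet below sketches the JSON layout that read_json_file appears to expect, inferred from the keys it accesses ('paragraphs', 'raw', 'segmented', 'tokens', 'orth', 'start', 'tag', 'head', 'dep', 'iob_ent'). The sentence, offsets, and tags are invented for illustration, and 'example.json' is a hypothetical file name.

import json

example = [
    {
        "paragraphs": [
            {
                "raw": "Hello world.",
                "segmented": "Hello<SEP>world<SEP>.",
                "tokens": [
                    # 'head' appears to hold the start offset of the head token;
                    # a value below 1 marks the root, in which case read_json_file
                    # falls back to the token's own 'start'.
                    {"start": 0,  "orth": "Hello", "tag": "UH", "head": 6, "dep": "intj",  "iob_ent": "O"},
                    {"start": 6,  "orth": "world", "tag": "NN", "head": 0, "dep": "ROOT",  "iob_ent": "O"},
                    {"start": 11, "orth": ".",     "tag": ".",  "head": 6, "dep": "punct", "iob_ent": "O"},
                ],
            }
        ]
    }
]

with open('example.json', 'w') as file_:
    json.dump(example, file_)

# read_json_file('example.json') would then return one entry of the form
# (raw, tokenized, (ids, words, tags, heads, labels, biluo_ents), brackets).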
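The new reader also passes the collected IOB entity tags through _iob_to_biluo before storing them. That helper is not shown in this diff; purely as a rough illustration of the tag scheme it refers to (assuming standard IOB input such as 'B-ORG', 'I-ORG', 'O'), a stand-alone conversion might look like the sketch below. This is not the implementation in conll.pyx.

def iob_to_biluo_sketch(tags):
    # Illustrative only: map IOB tags onto the BILUO scheme, where a
    # single-token entity becomes 'U-...' and the last token of a longer
    # entity becomes 'L-...'.
    out = []
    i = 0
    while i < len(tags):
        if tags[i] == 'O':
            out.append('O')
            i += 1
            continue
        label = tags[i].split('-', 1)[1]
        j = i + 1
        while j < len(tags) and tags[j] == 'I-' + label:
            j += 1
        if j - i == 1:
            out.append('U-' + label)
        else:
            out.append('B-' + label)
            out.extend('I-' + label for _ in range(j - i - 2))
            out.append('L-' + label)
        i = j
    return out

# iob_to_biluo_sketch(['B-ORG', 'I-ORG', 'O', 'B-PER'])
# -> ['B-ORG', 'L-ORG', 'O', 'U-PER']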