mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-10 16:22:29 +03:00
Fix docs2jsonl function
This commit is contained in:
parent
ea2edd1e2c
commit
a9fb6d5511
|
@ -11,6 +11,7 @@ import tempfile
|
||||||
import shutil
|
import shutil
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import msgpack
|
import msgpack
|
||||||
|
import json
|
||||||
|
|
||||||
import ujson
|
import ujson
|
||||||
|
|
||||||
|
@ -347,7 +348,11 @@ def _json_iterate(loc):
|
||||||
curly_depth -= 1
|
curly_depth -= 1
|
||||||
if square_depth == 1 and curly_depth == 0:
|
if square_depth == 1 and curly_depth == 0:
|
||||||
py_str = py_raw[start : i+1].decode('utf8')
|
py_str = py_raw[start : i+1].decode('utf8')
|
||||||
yield ujson.loads(py_str)
|
try:
|
||||||
|
yield json.loads(py_str)
|
||||||
|
except:
|
||||||
|
print(py_str)
|
||||||
|
raise
|
||||||
start = -1
|
start = -1
|
||||||
|
|
||||||
|
|
||||||
|
@ -579,9 +584,11 @@ def docs_to_json(id, docs):
|
||||||
json_sent = {'tokens': [], 'brackets': []}
|
json_sent = {'tokens': [], 'brackets': []}
|
||||||
for token in sent:
|
for token in sent:
|
||||||
json_token = {"id": token.i, "orth": token.text}
|
json_token = {"id": token.i, "orth": token.text}
|
||||||
json_token['tag'] = token.tag_ if doc.is_tagged else None
|
if doc.is_tagged:
|
||||||
json_token['head'] = (token.head.i-token.i) if doc.is_parsed else None
|
json_token['tag'] = token.tag_
|
||||||
json_token['dep'] = token.dep_ if doc.is_parsed else None
|
if doc.is_parsed:
|
||||||
|
json_token['head'] = token.head.i-token.i
|
||||||
|
json_token['dep'] = token.dep_
|
||||||
json_token['ner'] = biluo_tags[token.i]
|
json_token['ner'] = biluo_tags[token.i]
|
||||||
json_sent['tokens'].append(json_token)
|
json_sent['tokens'].append(json_token)
|
||||||
json_para['sentences'].append(json_sent)
|
json_para['sentences'].append(json_sent)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user