mirror of
https://github.com/explosion/spaCy.git
synced 2025-04-21 09:31:59 +03:00
Fix training/ partially.
This commit is contained in:
parent
77e9e4ddad
commit
50dac51dc8
|
@ -589,6 +589,7 @@ def _fix_legacy_dict_data(example_dict):
|
|||
"doc_annotation": doc_dict
|
||||
}
|
||||
|
||||
|
||||
def _has_field(annot, field):
|
||||
if field not in annot:
|
||||
return False
|
||||
|
@ -625,6 +626,7 @@ def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces):
|
|||
ent_types.append("")
|
||||
return ent_iobs, ent_types
|
||||
|
||||
|
||||
def _parse_links(vocab, words, spaces, links):
|
||||
reference = Doc(vocab, words=words, spaces=spaces)
|
||||
starts = {token.idx: token.i for token in reference}
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
import json
|
||||
import warnings
|
||||
|
||||
import srsly
|
||||
|
@ -6,7 +5,7 @@ import srsly
|
|||
from .. import util
|
||||
from ..errors import Warnings
|
||||
from ..tokens import Doc
|
||||
from .iob_utils import offsets_to_biluo_tags, tags_to_entities
|
||||
from .iob_utils import offsets_to_biluo_tags
|
||||
|
||||
|
||||
def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
|
||||
|
@ -23,7 +22,13 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
|
|||
json_doc = {"id": doc_id, "paragraphs": []}
|
||||
for i, doc in enumerate(docs):
|
||||
raw = None if doc.has_unknown_spaces else doc.text
|
||||
json_para = {'raw': raw, "sentences": [], "cats": [], "entities": [], "links": []}
|
||||
json_para = {
|
||||
'raw': raw,
|
||||
"sentences": [],
|
||||
"cats": [],
|
||||
"entities": [],
|
||||
"links": []
|
||||
}
|
||||
for cat, val in doc.cats.items():
|
||||
json_cat = {"label": cat, "value": val}
|
||||
json_para["cats"].append(json_cat)
|
||||
|
@ -35,13 +40,17 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
|
|||
if ent.kb_id_:
|
||||
link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}}
|
||||
json_para["links"].append(link_dict)
|
||||
biluo_tags = offsets_to_biluo_tags(doc, json_para["entities"], missing=ner_missing_tag)
|
||||
biluo_tags = offsets_to_biluo_tags(
|
||||
doc, json_para["entities"], missing=ner_missing_tag
|
||||
)
|
||||
attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "ENT_IOB")
|
||||
include_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
|
||||
for j, sent in enumerate(doc.sents):
|
||||
json_sent = {"tokens": [], "brackets": []}
|
||||
for token in sent:
|
||||
json_token = {"id": token.i, "orth": token.text, "space": token.whitespace_}
|
||||
json_token = {
|
||||
"id": token.i, "orth": token.text, "space": token.whitespace_
|
||||
}
|
||||
if include_annotation["TAG"]:
|
||||
json_token["tag"] = token.tag_
|
||||
if include_annotation["POS"]:
|
||||
|
@ -125,9 +134,14 @@ def json_to_annotations(doc):
|
|||
else:
|
||||
sent_starts.append(-1)
|
||||
if "brackets" in sent:
|
||||
brackets.extend((b["first"] + sent_start_i,
|
||||
b["last"] + sent_start_i, b["label"])
|
||||
for b in sent["brackets"])
|
||||
brackets.extend(
|
||||
(
|
||||
b["first"] + sent_start_i,
|
||||
b["last"] + sent_start_i,
|
||||
b["label"]
|
||||
)
|
||||
for b in sent["brackets"]
|
||||
)
|
||||
|
||||
example["token_annotation"] = dict(
|
||||
ids=ids,
|
||||
|
@ -160,6 +174,7 @@ def json_to_annotations(doc):
|
|||
)
|
||||
yield example
|
||||
|
||||
|
||||
def json_iterate(bytes utf8_str):
|
||||
# We should've made these files jsonl... But since we didn't, parse out
|
||||
# the docs one-by-one to reduce memory usage.
|
||||
|
|
Loading…
Reference in New Issue
Block a user