mirror of
https://github.com/explosion/spaCy.git
synced 2025-03-12 15:25:47 +03:00
prevent writing dummy values like deps because that could interfere with sent_start values
This commit is contained in:
parent
0b6d45eae1
commit
e822367cf7
|
@ -72,7 +72,7 @@ class GoldCorpus(object):
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def read_annotations(locs, limit=0):
|
def read_annotations(locs, limit=0):
|
||||||
""" Yield training examples """
|
""" Yield training examples as example dicts """
|
||||||
i = 0
|
i = 0
|
||||||
for loc in locs:
|
for loc in locs:
|
||||||
loc = util.ensure_path(loc)
|
loc = util.ensure_path(loc)
|
||||||
|
|
|
@ -108,12 +108,18 @@ def json_to_annotations(doc):
|
||||||
words.append(token["orth"])
|
words.append(token["orth"])
|
||||||
spaces.append(token.get("space", True))
|
spaces.append(token.get("space", True))
|
||||||
ids.append(token.get('id', sent_start_i + i))
|
ids.append(token.get('id', sent_start_i + i))
|
||||||
tags.append(token.get('tag', "-"))
|
if "tag" in token:
|
||||||
pos.append(token.get("pos", ""))
|
tags.append(token["tag"])
|
||||||
morphs.append(token.get("morph", ""))
|
if "pos" in token:
|
||||||
lemmas.append(token.get("lemma", ""))
|
pos.append(token["pos"])
|
||||||
heads.append(token.get("head", 0) + sent_start_i + i)
|
if "morph" in token:
|
||||||
labels.append(token.get("dep", ""))
|
morphs.append(token["morph"])
|
||||||
|
if "lemma" in token:
|
||||||
|
lemmas.append(token["lemma"])
|
||||||
|
if "head" in token:
|
||||||
|
heads.append(token["head"])
|
||||||
|
if "dep" in token:
|
||||||
|
labels.append(token["dep"])
|
||||||
# Ensure ROOT label is case-insensitive
|
# Ensure ROOT label is case-insensitive
|
||||||
if labels[-1].lower() == "root":
|
if labels[-1].lower() == "root":
|
||||||
labels[-1] = "ROOT"
|
labels[-1] = "ROOT"
|
||||||
|
@ -130,15 +136,24 @@ def json_to_annotations(doc):
|
||||||
ids=ids,
|
ids=ids,
|
||||||
words=words,
|
words=words,
|
||||||
spaces=spaces,
|
spaces=spaces,
|
||||||
tags=tags,
|
|
||||||
pos=pos,
|
|
||||||
morphs=morphs,
|
|
||||||
lemmas=lemmas,
|
|
||||||
heads=heads,
|
|
||||||
deps=labels,
|
|
||||||
sent_starts=sent_starts,
|
sent_starts=sent_starts,
|
||||||
brackets=brackets
|
brackets=brackets
|
||||||
)
|
)
|
||||||
|
# avoid including dummy values that look like gold info was present
|
||||||
|
if tags:
|
||||||
|
example["token_annotation"]["tags"] = tags
|
||||||
|
if pos:
|
||||||
|
example["token_annotation"]["pos"] = pos
|
||||||
|
if morphs:
|
||||||
|
example["token_annotation"]["morphs"] = morphs
|
||||||
|
if lemmas:
|
||||||
|
example["token_annotation"]["lemmas"] = lemmas
|
||||||
|
if heads:
|
||||||
|
example["token_annotation"]["heads"] = heads
|
||||||
|
if labels:
|
||||||
|
example["token_annotation"]["deps"] = labels
|
||||||
|
if pos:
|
||||||
|
example["token_annotation"]["pos"] = pos
|
||||||
|
|
||||||
cats = {}
|
cats = {}
|
||||||
for cat in paragraph.get("cats", {}):
|
for cat in paragraph.get("cats", {}):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user