mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-06 14:40:34 +03:00
084271c9e9
* Move get_parses_from_example to spacy.syntax * Get GoldParse out of Example * Avoid expecting GoldParse input in parser * Add Alignment to spacy.gold.align * Update Example object * Add comment * Update pipeline * Fix imports * Simplify gold_io * WIP on GoldCorpus * Update test * Xfail some gold tests * Remove ignore_misaligned option from GoldCorpus * Fix Example constructor * Update test * Fix usage of Example * Add deprecated_get_gold method on Example * Patch scorer * Fix test * Fix test * Update tests * Xfail a test * Fix passing of make_projective * Pass make_projective by default * Hack data format in Example.from_dict * Update tests * Fix example.from_dict * Update morphologizer * Fix entity linker * Add get_field to TokenAnnotation * Fix Example.get_aligned * Update test * Fix alignment * Fix corpus * Fix GoldCorpus * Handle misaligned * Format * Fix missing import
148 lines
4.4 KiB
Python
148 lines
4.4 KiB
Python
class TokenAnnotation:
    """Container for token-level annotation sequences.

    Each annotation layer lives on its own attribute (``ids``, ``words``,
    ``tags``, ``pos``, ``morphs``, ``lemmas``, ``heads``, ``deps``,
    ``entities``, ``sent_starts``). Brackets are kept grouped by their start
    value in ``brackets_by_start`` and are exposed again as flat
    ``(start, end, label)`` triples through the ``brackets`` property.
    """

    # Keys used by both from_dict() and to_dict(). Every key is also the name
    # of the attribute (or property) that holds the corresponding value.
    _DICT_KEYS = (
        "ids",
        "words",
        "tags",
        "pos",
        "morphs",
        "lemmas",
        "heads",
        "deps",
        "entities",
        "sent_starts",
        "brackets",
    )

    # (name accepted by get_field, attribute it resolves to), in lookup order.
    _FIELD_ATTRS = (
        ("id", "ids"),
        ("word", "words"),
        ("tag", "tags"),
        ("pos", "pos"),
        ("morph", "morphs"),
        ("lemma", "lemmas"),
        ("head", "heads"),
        ("dep", "deps"),
        ("ner", "entities"),
        ("sent_start", "sent_starts"),
    )

    def __init__(
        self,
        ids=None,
        words=None,
        tags=None,
        pos=None,
        morphs=None,
        lemmas=None,
        heads=None,
        deps=None,
        entities=None,
        sent_starts=None,
        brackets=None,
    ):
        """Store the given annotations.

        Any falsy argument (``None``, an empty sequence, ...) is replaced by
        a fresh empty list. ``brackets``, when truthy, is iterated as
        ``(start, end, label)`` triples which are grouped by ``start``.
        """
        self.ids = ids or []
        self.words = words or []
        self.tags = tags or []
        self.pos = pos or []
        self.morphs = morphs or []
        self.lemmas = lemmas or []
        self.heads = heads or []
        self.deps = deps or []
        self.entities = entities or []
        self.sent_starts = sent_starts or []
        # Bind the attribute before filling it so the grouping dict is
        # attached to the instance from the start.
        grouped = self.brackets_by_start = {}
        for left, right, tag in brackets or ():
            grouped.setdefault(left, []).append((right, tag))

    def get_field(self, field):
        """Return the annotation list registered under ``field``.

        Accepted names are "id", "word", "tag", "pos", "morph", "lemma",
        "head", "dep", "ner" and "sent_start".

        Raises:
            ValueError: if ``field`` is none of the accepted names.
        """
        # Compare with == (not a dict lookup) so unhashable values still end
        # up in the ValueError branch.
        for known, attr in self._FIELD_ATTRS:
            if field == known:
                return getattr(self, attr)
        raise ValueError(f"Unknown field: {field}")

    @property
    def brackets(self):
        """Flat list of ``(start, end, label)`` built from ``brackets_by_start``."""
        return [
            (left, right, tag)
            for left, spans in self.brackets_by_start.items()
            for right, tag in spans
        ]

    @classmethod
    def from_dict(cls, token_dict):
        """Build an instance from a mapping; missing keys are passed as ``None``."""
        return cls(**{key: token_dict.get(key, None) for key in cls._DICT_KEYS})

    def to_dict(self):
        """Return the annotations as a dict using the same keys as ``from_dict``."""
        return {key: getattr(self, key) for key in self._DICT_KEYS}

    @staticmethod
    def _item_or(values, i, fallback):
        """Return ``values[i]`` when ``i < len(values)``, otherwise ``fallback``."""
        if i < len(values):
            return values[i]
        return fallback

    def get_id(self, i):
        """Return ``ids[i]``, or ``i`` itself when ``i`` is past the end."""
        return self._item_or(self.ids, i, i)

    def get_word(self, i):
        """Return ``words[i]``, or ``""`` when ``i`` is past the end."""
        return self._item_or(self.words, i, "")

    def get_tag(self, i):
        """Return ``tags[i]``, or ``"-"`` when ``i`` is past the end."""
        return self._item_or(self.tags, i, "-")

    def get_pos(self, i):
        """Return ``pos[i]``, or ``""`` when ``i`` is past the end."""
        return self._item_or(self.pos, i, "")

    def get_morph(self, i):
        """Return ``morphs[i]``, or ``""`` when ``i`` is past the end."""
        return self._item_or(self.morphs, i, "")

    def get_lemma(self, i):
        """Return ``lemmas[i]``, or ``""`` when ``i`` is past the end."""
        return self._item_or(self.lemmas, i, "")

    def get_head(self, i):
        """Return ``heads[i]``, or ``i`` itself when ``i`` is past the end."""
        return self._item_or(self.heads, i, i)

    def get_dep(self, i):
        """Return ``deps[i]``, or ``""`` when ``i`` is past the end."""
        return self._item_or(self.deps, i, "")

    def get_entity(self, i):
        """Return ``entities[i]``, or ``"-"`` when ``i`` is past the end."""
        return self._item_or(self.entities, i, "-")

    def get_sent_start(self, i):
        """Return ``sent_starts[i]``, or ``None`` when ``i`` is past the end."""
        return self._item_or(self.sent_starts, i, None)

    def __str__(self):
        """String form of ``to_dict()``."""
        as_dict = self.to_dict()
        return str(as_dict)

    def __repr__(self):
        """Same text as ``__str__``."""
        return self.__str__()
|
|
|
|
|
|
class DocAnnotation:
    """Container for the two document-level annotation dicts, ``cats`` and ``links``."""

    def __init__(self, cats=None, links=None):
        """Store ``cats`` and ``links``; a falsy value becomes a fresh empty dict."""
        self.cats = cats or {}
        self.links = links or {}

    @classmethod
    def from_dict(cls, doc_dict):
        """Build an instance from a mapping; missing keys are passed as ``None``."""
        cats = doc_dict.get("cats", None)
        links = doc_dict.get("links", None)
        return cls(cats=cats, links=links)

    def to_dict(self):
        """Return ``{"cats": ..., "links": ...}`` referencing the stored objects."""
        return {name: getattr(self, name) for name in ("cats", "links")}

    def __str__(self):
        """String form of ``to_dict()``."""
        as_dict = self.to_dict()
        return str(as_dict)

    def __repr__(self):
        """Same text as ``__str__``."""
        return self.__str__()
|