Mirror of https://github.com/explosion/spaCy.git
Restructure Example with merged sents as default (#4632)
* Switch to train_dataset() function in train CLI

* Fixes for pipe() methods in pipeline components
  * Don't clobber `examples` variable with `as_example` in pipe() methods
  * Remove unnecessary traversals of `examples`

* Update Parser.pipe() for Examples
  * Add `as_examples` kwarg to `pipe()` with implementation to return `Example`s
  * Accept `Doc` or `Example` in `pipe()` with `_get_doc()` (copied from `Pipe`)

* Fixes to Example implementation in spacy.gold
  * Move `make_projective` from an attribute of `Example` to an argument of `Example.get_gold_parses()`
  * Heads of 0 are not treated as unset
  * Unset heads are set to self rather than `None` (which causes problems while projectivizing)
  * Check for `Doc` (not just not `None`) when creating GoldParses for a pre-merged example
  * Don't clobber `examples` variable in `iter_gold_docs()`

* Add/modify gold tests for handling projectivity
  * In the JSON roundtrip, compare results from `dev_dataset` rather than `train_dataset` to avoid projectivization (and other potential modifications)
  * Add test for projective train vs. nonprojective dev versions of the same `Doc`

* Handle ignore_misaligned as arg rather than attr

  Move `ignore_misaligned` from an attribute of `Example` to an argument to `Example.get_gold_parses()`, which makes it parallel to `make_projective`. Add a test with old and new align that checks whether `ignore_misaligned` errors are raised as expected (only for new align).

* Remove unused attrs from gold.pxd

  Remove `ignore_misaligned` and `make_projective` from `gold.pxd`.

* Restructure Example with merged sents as default

  An `Example` now includes a single `TokenAnnotation` that includes all the information from one `Doc` (= JSON `paragraph`). If required, the individual sentences can be returned as a list of examples with `Example.split_sents()`, with no raw text available.

  * Input/output a single `Example.token_annotation`
  * Add `sent_starts` to `TokenAnnotation` to handle sentence boundaries
  * Replace `Example.merge_sents()` with `Example.split_sents()`
  * Modify components (pipeline components and the conllu2json converter) to use a single `Example.token_annotation`
  * Rework/rename `add_token_annotation()` and `add_doc_annotation()` to `set_token_annotation()` and `set_doc_annotation()`, functions that set rather than append/extend
  * Rename `morphology` to `morphs` in `TokenAnnotation` and `GoldParse`
  * Add getters to `TokenAnnotation` to supply default values when a given attribute is not available
  * `Example.get_gold_parses()` in `spacy.gold._make_golds()` is only applied to single examples, so the `GoldParse` is saved in the provided `Example` rather than creating a new `Example` with no other internal annotation
  * Update tests for API changes and `merge_sents()` vs. `split_sents()`

* Refer to Example.goldparse in iter_gold_docs()

  Use `Example.goldparse` in `iter_gold_docs()` instead of `Example.gold`, because a `None` `GoldParse` is generated with `ignore_misaligned`, and generating it on the fly can raise an unwanted AlignmentError.

* Fix make_orth_variants()

  Fix a bug in make_orth_variants() related to the conversion from multiple to one TokenAnnotation per Example.

* Add basic test for make_orth_variants()

* Replace try/except with conditionals

* Replace default morph value with set
This commit is contained in:
parent 44829950ba
commit 392c4880d9
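To illustrate the reworked API before the diffs, here is a minimal usage sketch (not part of the commit; it mirrors the `merged_dict` fixture and `test_split_sents` test added below, with the fixture's extra trailing `sent_starts` value dropped; import paths assume spacy.gold at this commit):

    from spacy.gold import Example
    from spacy.lang.en import English

    nlp = English()
    example = Example()
    # One TokenAnnotation now carries a whole paragraph; sentence
    # boundaries live in sent_starts instead of separate annotations.
    example.set_token_annotation(
        ids=[1, 2, 3, 4, 5, 6, 7],
        words=["Hi", "there", "everyone", "It", "is", "just", "me"],
        tags=["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"],
        sent_starts=[1, 0, 0, 1, 0, 0, 0],
    )
    # Split back into one Example per sentence (no raw text retained)
    assert len(example.split_sents()) == 2
    # merge=True (the new default) yields a single (doc, GoldParse) pair
    assert len(example.get_gold_parses(merge=True, vocab=nlp.vocab)) == 1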
@@ -24,17 +24,16 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
     checked_for_ner = False
     has_ner_tags = False
     for i, example in enumerate(conll_data):
-        for token_annotation in example.token_annotations:
-            if not checked_for_ner:
-                has_ner_tags = is_ner(token_annotation.entities[0])
-                checked_for_ner = True
-            sentences.append(generate_sentence(token_annotation, has_ner_tags))
+        if not checked_for_ner:
+            has_ner_tags = is_ner(example.token_annotation.entities[0])
+            checked_for_ner = True
+        sentences.append(generate_sentence(example.token_annotation, has_ner_tags))
         # Real-sized documents could be extracted using the comments on the
-        # conluu document
+        # conllu document
         if len(sentences) % n_sents == 0:
             doc = create_doc(sentences, i)
             docs.append(doc)
             sentences = []
     return docs

@@ -84,7 +83,7 @@ def read_conllx(input_data, use_morphology=False, n=0):
                 print(line)
                 raise
         example = Example(doc=None)
-        example.add_token_annotation(ids=ids, words=words, tags=tags,
+        example.set_token_annotation(ids=ids, words=words, tags=tags,
                                      heads=heads, deps=deps, entities=ents)
         yield example
         i += 1
@@ -25,7 +25,7 @@ cdef class GoldParse:
     cdef public int loss
     cdef public list words
     cdef public list tags
-    cdef public list morphology
+    cdef public list morphs
     cdef public list heads
     cdef public list labels
     cdef public dict orths

@@ -45,7 +45,8 @@ cdef class TokenAnnotation:
     cdef public list heads
     cdef public list deps
     cdef public list entities
-    cdef public list morphology
+    cdef public list morphs
+    cdef public list sent_starts
     cdef public list brackets

@@ -56,7 +57,7 @@ cdef class DocAnnotation:

 cdef class Example:
     cdef public object doc
-    cdef public list token_annotations
+    cdef public TokenAnnotation token_annotation
     cdef public DocAnnotation doc_annotation
     cdef public object goldparse
spacy/gold.pyx
@@ -215,7 +215,7 @@ class GoldCorpus(object):
                 ex_dict = example.to_dict()
                 text = example.text
                 srsly.write_msgpack(directory / "{}.msg".format(i), (text, ex_dict))
-                n += len(example.token_annotations)
+                n += 1
                 if limit and n >= limit:
                     break
@@ -271,7 +271,7 @@ class GoldCorpus(object):
             raise ValueError(Errors.E124.format(path=path2str(loc), formats=supported))
         for example in examples:
             yield example
-            i += len(example.token_annotations)
+            i += 1
             if limit and i >= limit:
                 return
@@ -286,15 +286,14 @@ class GoldCorpus(object):
         yield from self.read_examples(locs, limit=self.limit)

     def count_train(self):
-        # TODO: should this count words or sentences ?
+        """Returns count of words in train examples"""
         n = 0
         i = 0
         for example in self.train_examples:
-            for token_annotation in example.token_annotations:
-                n += len(token_annotation.words)
-                if self.limit and i >= self.limit:
-                    break
-                i += 1
+            n += len(example.token_annotation.words)
+            if self.limit and i >= self.limit:
+                break
+            i += 1
         return n

     def train_dataset(self, nlp, gold_preproc=False, max_length=None,
@@ -328,18 +327,27 @@ class GoldCorpus(object):
     def iter_gold_docs(cls, nlp, examples, gold_preproc, max_length=None,
                        noise_level=0.0, orth_variant_level=0.0,
                        make_projective=False, ignore_misaligned=False):
-        """ Setting gold_preproc will result in creating a doc per 'sentence' """
+        """ Setting gold_preproc will result in creating a doc per sentence """
         for example in examples:
             if gold_preproc:
                 example.doc = None
+                split_examples = example.split_sents()
+                example_golds = []
+                for split_example in split_examples:
+                    split_example_docs = cls._make_docs(nlp, split_example,
+                            gold_preproc, noise_level=noise_level,
+                            orth_variant_level=orth_variant_level)
+                    split_example_golds = cls._make_golds(split_example_docs,
+                            vocab=nlp.vocab, make_projective=make_projective,
+                            ignore_misaligned=ignore_misaligned)
+                    example_golds.extend(split_example_golds)
             else:
-                example = example.merge_sents()
-            example_docs = cls._make_docs(nlp, example,
-                    gold_preproc, noise_level=noise_level,
-                    orth_variant_level=orth_variant_level)
-            example_golds = cls._make_golds(example_docs, vocab=nlp.vocab,
-                    make_projective=make_projective,
-                    ignore_misaligned=ignore_misaligned)
+                example_docs = cls._make_docs(nlp, example,
+                        gold_preproc, noise_level=noise_level,
+                        orth_variant_level=orth_variant_level)
+                example_golds = cls._make_golds(example_docs, vocab=nlp.vocab,
+                        make_projective=make_projective,
+                        ignore_misaligned=ignore_misaligned)
             for ex in example_golds:
                 if ex.goldparse is not None:
                     if (not max_length) or len(ex.doc) < max_length:
@@ -353,35 +361,28 @@ class GoldCorpus(object):
             var_text = add_noise(var_example.text, noise_level)
             var_doc = nlp.make_doc(var_text)
             var_example.doc = var_doc
-            return [var_example]
         else:
-            doc_examples = []
-            for token_annotation in var_example.token_annotations:
-                t_doc = Doc(nlp.vocab, words=add_noise(token_annotation.words, noise_level))
-                doc_example = Example(doc_annotation=example.doc_annotation,
-                                      token_annotations=[token_annotation],
-                                      doc=t_doc)
-                doc_examples.append(doc_example)
-            return doc_examples
+            var_doc = Doc(nlp.vocab, words=add_noise(var_example.token_annotation.words, noise_level))
+            var_example.doc = var_doc
+        return [var_example]

     @classmethod
     def _make_golds(cls, examples, vocab=None, make_projective=False,
                     ignore_misaligned=False):
-        gold_examples = []
         for example in examples:
             gold_parses = example.get_gold_parses(vocab=vocab,
                     make_projective=make_projective,
                     ignore_misaligned=ignore_misaligned)
-            for (doc, gold) in gold_parses:
-                ex = Example(doc=doc)
-                ex.goldparse = gold
-                gold_examples.append(ex)
-        return gold_examples
+            assert len(gold_parses) == 1
+            assert gold_parses[0][0] == example.doc
+            example.goldparse = gold_parses[0][1]
+        return examples


 def make_orth_variants(nlp, example, orth_variant_level=0.0):
     if random.random() >= orth_variant_level:
         return example
-    if not example.token_annotations:
+    if not example.token_annotation:
         return example
     raw = example.text
     if random.random() >= 0.5:
@@ -392,46 +393,46 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
     ndpv = nlp.Defaults.paired_orth_variants
     # modify words in paragraph_tuples
     variant_example = Example(doc=raw)
-    for token_annotation in example.token_annotations:
+    token_annotation = example.token_annotation
     words = token_annotation.words
     tags = token_annotation.tags
     if not words or not tags:
         # add the unmodified annotation
         token_dict = token_annotation.to_dict()
-        variant_example.add_token_annotation(**token_dict)
+        variant_example.set_token_annotation(**token_dict)
     else:
         if lower:
             words = [w.lower() for w in words]
         # single variants
         punct_choices = [random.choice(x["variants"]) for x in ndsv]
         for word_idx in range(len(words)):
             for punct_idx in range(len(ndsv)):
                 if tags[word_idx] in ndsv[punct_idx]["tags"] \
                         and words[word_idx] in ndsv[punct_idx]["variants"]:
                     words[word_idx] = punct_choices[punct_idx]
         # paired variants
         punct_choices = [random.choice(x["variants"]) for x in ndpv]
         for word_idx in range(len(words)):
             for punct_idx in range(len(ndpv)):
                 if tags[word_idx] in ndpv[punct_idx]["tags"] \
                         and words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]):
                     # backup option: random left vs. right from pair
                     pair_idx = random.choice([0, 1])
                     # best option: rely on paired POS tags like `` / ''
                     if len(ndpv[punct_idx]["tags"]) == 2:
                         pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx])
                     # next best option: rely on position in variants
                     # (may not be unambiguous, so order of variants matters)
                     else:
                         for pair in ndpv[punct_idx]["variants"]:
                             if words[word_idx] in pair:
                                 pair_idx = pair.index(words[word_idx])
                     words[word_idx] = punct_choices[punct_idx][pair_idx]

         token_dict = token_annotation.to_dict()
         token_dict["words"] = words
         token_dict["tags"] = tags
-        variant_example.add_token_annotation(**token_dict)
+        variant_example.set_token_annotation(**token_dict)
     # modify raw to match variant_paragraph_tuples
     if raw is not None:
         variants = []
@@ -449,30 +450,29 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
         while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
             variant_raw += raw[raw_idx]
             raw_idx += 1
-        for token_annotation in variant_example.token_annotations:
-            for word in token_annotation.words:
+        for word in variant_example.token_annotation.words:
             match_found = False
             # add identical word
             if word not in variants and raw[raw_idx:].startswith(word):
                 variant_raw += word
                 raw_idx += len(word)
                 match_found = True
             # add variant word
             else:
                 for variant in variants:
                     if not match_found and \
                             raw[raw_idx:].startswith(variant):
                         raw_idx += len(variant)
                         variant_raw += word
                         match_found = True
             # something went wrong, abort
             # (add a warning message?)
             if not match_found:
                 return example
             # add following whitespace
             while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
                 variant_raw += raw[raw_idx]
                 raw_idx += 1

         variant_example.doc = variant_raw
         return variant_example
     return variant_example
@@ -521,30 +521,43 @@ def json_to_examples(doc):
     paragraphs = []
     for paragraph in doc["paragraphs"]:
         example = Example(doc=paragraph.get("raw", None))
+        words = []
+        ids = []
+        tags = []
+        heads = []
+        labels = []
+        ner = []
+        morphs = []
+        sent_starts = []
+        brackets = []
         for sent in paragraph["sentences"]:
-            words = []
-            ids = []
-            tags = []
-            heads = []
-            labels = []
-            ner = []
+            sent_start_i = len(words)
             for i, token in enumerate(sent["tokens"]):
                 words.append(token["orth"])
-                ids.append(i)
+                ids.append(token.get('id', sent_start_i + i))
                 tags.append(token.get('tag', "-"))
-                heads.append(token.get("head", 0) + i)
+                heads.append(token.get("head", 0) + sent_start_i + i)
                 labels.append(token.get("dep", ""))
                 # Ensure ROOT label is case-insensitive
                 if labels[-1].lower() == "root":
                     labels[-1] = "ROOT"
                 ner.append(token.get("ner", "-"))
-            example.add_token_annotation(ids=ids, words=words, tags=tags,
-                                         heads=heads, deps=labels, entities=ner,
-                                         brackets=sent.get("brackets", []))
+                morphs.append(token.get("morph", {}))
+                if i == 0:
+                    sent_starts.append(True)
+                else:
+                    sent_starts.append(False)
+            if "brackets" in sent:
+                brackets.extend((b["first"] + sent_start_i,
+                                 b["last"] + sent_start_i, b["label"])
+                                 for b in sent["brackets"])
         cats = {}
         for cat in paragraph.get("cats", {}):
             cats[cat["label"]] = cat["value"]
-        example.add_doc_annotation(cats=cats)
+        example.set_token_annotation(ids=ids, words=words, tags=tags,
+                                     heads=heads, deps=labels, entities=ner, morphs=morphs,
+                                     sent_starts=sent_starts, brackets=brackets)
+        example.set_doc_annotation(cats=cats)
         yield example
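The `sent_start_i` offset above converts per-sentence token indices into paragraph-level ones; a small worked sketch (hypothetical values, not from the commit):

    sent_start_i = 3   # second sentence starts at paragraph token 3
    i = 1              # second token within that sentence
    head = -1          # JSON heads are stored relative to the token
    # paragraph-level head index, as computed in json_to_examples()
    assert head + sent_start_i + i == 3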
@@ -652,15 +665,16 @@ def _consume_ent(tags):


 cdef class TokenAnnotation:
-    def __init__(self, ids=None, words=None, tags=None, heads=None, deps=None, entities=None, morphology=None, brackets=None):
+    def __init__(self, ids=None, words=None, tags=None, heads=None, deps=None, entities=None, morphs=None, sent_starts=None, brackets=None):
         self.ids = ids if ids else []
         self.words = words if words else []
         self.tags = tags if tags else []
         self.heads = heads if heads else []
         self.deps = deps if deps else []
         self.entities = entities if entities else []
+        self.morphs = morphs if morphs else []
+        self.sent_starts = sent_starts if sent_starts else []
         self.brackets = brackets if brackets else []
-        self.morphology = morphology if morphology else []

     @classmethod
     def from_dict(cls, token_dict):
|
||||||
heads=token_dict.get("heads", None),
|
heads=token_dict.get("heads", None),
|
||||||
deps=token_dict.get("deps", None),
|
deps=token_dict.get("deps", None),
|
||||||
entities=token_dict.get("entities", None),
|
entities=token_dict.get("entities", None),
|
||||||
morphology=token_dict.get("morphology", None),
|
morphs=token_dict.get("morphs", None),
|
||||||
|
sent_starts=token_dict.get("sent_starts", None),
|
||||||
brackets=token_dict.get("brackets", None))
|
brackets=token_dict.get("brackets", None))
|
||||||
|
|
||||||
def to_dict(self):
|
def to_dict(self):
|
||||||
|
@@ -680,9 +695,34 @@ cdef class TokenAnnotation:
                 "heads": self.heads,
                 "deps": self.deps,
                 "entities": self.entities,
-                "morphology": self.morphology,
+                "morphs": self.morphs,
+                "sent_starts": self.sent_starts,
                 "brackets": self.brackets}

+    def get_id(self, i):
+        return self.ids[i] if i < len(self.ids) else i
+
+    def get_word(self, i):
+        return self.words[i] if i < len(self.words) else ""
+
+    def get_tag(self, i):
+        return self.tags[i] if i < len(self.tags) else "-"
+
+    def get_head(self, i):
+        return self.heads[i] if i < len(self.heads) else i
+
+    def get_dep(self, i):
+        return self.deps[i] if i < len(self.deps) else ""
+
+    def get_entity(self, i):
+        return self.entities[i] if i < len(self.entities) else "-"
+
+    def get_morph(self, i):
+        return self.morphs[i] if i < len(self.morphs) else set()
+
+    def get_sent_start(self, i):
+        return self.sent_starts[i] if i < len(self.sent_starts) else None
+

 cdef class DocAnnotation:
     def __init__(self, cats=None, links=None):
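A quick sketch of the new default-value getters in action (illustrative only, based on the constructor and getters shown above):

    from spacy.gold import TokenAnnotation

    t = TokenAnnotation(words=["Hi", "there"], tags=["INTJ"])
    assert t.get_word(1) == "there"
    assert t.get_tag(1) == "-"       # default when no tag is available
    assert t.get_morph(0) == set()   # default morph value is now a set
    assert t.get_head(1) == 1        # missing heads default to self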
@@ -698,33 +738,33 @@ cdef class DocAnnotation:


 cdef class Example:
-    def __init__(self, doc_annotation=None, token_annotations=None, doc=None,
+    def __init__(self, doc_annotation=None, token_annotation=None, doc=None,
                  goldparse=None):
         """ Doc can either be text, or an actual Doc """
         self.doc = doc
         self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation()
-        self.token_annotations = token_annotations if token_annotations else []
+        self.token_annotation = token_annotation if token_annotation else TokenAnnotation()
         self.goldparse = goldparse

     @classmethod
     def from_gold(cls, goldparse, doc=None):
         doc_annotation = DocAnnotation(cats=goldparse.cats, links=goldparse.links)
         token_annotation = goldparse.get_token_annotation()
-        return cls(doc_annotation, [token_annotation], doc)
+        return cls(doc_annotation, token_annotation, doc)

     @classmethod
     def from_dict(cls, example_dict, doc=None):
-        token_dicts = example_dict["token_annotations"]
-        token_annotations = [TokenAnnotation.from_dict(t) for t in token_dicts]
+        token_dict = example_dict["token_annotation"]
+        token_annotation = TokenAnnotation.from_dict(token_dict)
         doc_dict = example_dict["doc_annotation"]
         doc_annotation = DocAnnotation.from_dict(doc_dict)
-        return cls(doc_annotation, token_annotations, doc)
+        return cls(doc_annotation, token_annotation, doc)

     def to_dict(self):
         """ Note that this method does NOT export the doc, only the annotations ! """
-        token_dicts = [t.to_dict() for t in self.token_annotations]
+        token_dict = self.token_annotation.to_dict()
         doc_dict = self.doc_annotation.to_dict()
-        return {"token_annotations": token_dicts, "doc_annotation": doc_dict}
+        return {"token_annotation": token_dict, "doc_annotation": doc_dict}

     @property
     def text(self):
|
||||||
@property
|
@property
|
||||||
def gold(self):
|
def gold(self):
|
||||||
if self.goldparse is None:
|
if self.goldparse is None:
|
||||||
doc, gold = self.get_gold_parses(merge=True)[0]
|
doc, gold = self.get_gold_parses()[0]
|
||||||
self.goldparse = gold
|
self.goldparse = gold
|
||||||
return self.goldparse
|
return self.goldparse
|
||||||
|
|
||||||
def add_token_annotation(self, ids=None, words=None, tags=None, heads=None,
|
def set_token_annotation(self, ids=None, words=None, tags=None, heads=None,
|
||||||
deps=None, entities=None, morphology=None, brackets=None):
|
deps=None, entities=None, morphs=None,
|
||||||
t = TokenAnnotation(ids=ids, words=words, tags=tags,
|
sent_starts=None, brackets=None):
|
||||||
|
self.token_annotation = TokenAnnotation(ids=ids, words=words, tags=tags,
|
||||||
heads=heads, deps=deps, entities=entities,
|
heads=heads, deps=deps, entities=entities,
|
||||||
morphology=morphology, brackets=brackets)
|
morphs=morphs, sent_starts=sent_starts,
|
||||||
self.token_annotations.append(t)
|
brackets=brackets)
|
||||||
|
|
||||||
def add_doc_annotation(self, cats=None, links=None):
|
def set_doc_annotation(self, cats=None, links=None):
|
||||||
if cats:
|
if cats:
|
||||||
self.doc_annotation.cats.update(cats)
|
self.doc_annotation.cats = cats
|
||||||
if links:
|
if links:
|
||||||
self.doc_annotation.links.update(links)
|
self.doc_annotation.links = links
|
||||||
|
|
||||||
def merge_sents(self):
|
def split_sents(self):
|
||||||
""" Merge the list of token annotations into one object and return this new object """
|
""" Split the token annotations into multiple Examples based on
|
||||||
m_example = Example(doc=self.doc, doc_annotation=self.doc_annotation)
|
sent_starts and return a list of the new Examples"""
|
||||||
m_ids, m_words, m_tags, m_heads, m_deps, m_ents, m_morph = [], [], [], [], [], [], []
|
s_example = Example(doc=None, doc_annotation=self.doc_annotation)
|
||||||
m_brackets = []
|
s_ids, s_words, s_tags, s_heads = [], [], [], []
|
||||||
i = 0
|
s_deps, s_ents, s_morphs, s_sent_starts = [], [], [], []
|
||||||
for t in self.token_annotations:
|
s_brackets = []
|
||||||
m_ids.extend(id_ + i for id_ in t.ids)
|
sent_start_i = 0
|
||||||
m_words.extend(t.words)
|
t = self.token_annotation
|
||||||
m_tags.extend(t.tags)
|
split_examples = []
|
||||||
m_heads.extend(head + i if head is not None and head >= 0 else head_i + i for head_i, head in enumerate(t.heads))
|
for i in range(len(t.words)):
|
||||||
m_deps.extend(t.deps)
|
if i > 0 and t.sent_starts[i] == True:
|
||||||
m_ents.extend(t.entities)
|
s_example.set_token_annotation(ids=s_ids,
|
||||||
m_morph.extend(t.morphology)
|
words=s_words, tags=s_tags, heads=s_heads, deps=s_deps,
|
||||||
m_brackets.extend((b["first"] + i, b["last"] + i, b["label"])
|
entities=s_ents, morphs=s_morphs,
|
||||||
for b in t.brackets)
|
sent_starts=s_sent_starts, brackets=s_brackets)
|
||||||
i += len(t.ids)
|
split_examples.append(s_example)
|
||||||
m_example.add_token_annotation(ids=m_ids, words=m_words, tags=m_tags,
|
s_example = Example(doc=None, doc_annotation=self.doc_annotation)
|
||||||
heads=m_heads, deps=m_deps, entities=m_ents,
|
s_ids, s_words, s_tags, s_heads = [], [], [], []
|
||||||
morphology=m_morph, brackets=m_brackets)
|
s_deps, s_ents, s_morphs, s_sent_starts = [], [], [], []
|
||||||
return m_example
|
s_brackets = []
|
||||||
|
sent_start_i = i
|
||||||
|
s_ids.append(t.get_id(i))
|
||||||
|
s_words.append(t.get_word(i))
|
||||||
|
s_tags.append(t.get_tag(i))
|
||||||
|
s_heads.append(t.get_head(i) - sent_start_i)
|
||||||
|
s_deps.append(t.get_dep(i))
|
||||||
|
s_ents.append(t.get_entity(i))
|
||||||
|
s_morphs.append(t.get_morph(i))
|
||||||
|
s_sent_starts.append(t.get_sent_start(i))
|
||||||
|
s_brackets.extend((b[0] - sent_start_i,
|
||||||
|
b[1] - sent_start_i, b[2])
|
||||||
|
for b in t.brackets if b[0] == i)
|
||||||
|
i += 1
|
||||||
|
s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags,
|
||||||
|
heads=s_heads, deps=s_deps, entities=s_ents,
|
||||||
|
morphs=s_morphs, sent_starts=s_sent_starts,
|
||||||
|
brackets=s_brackets)
|
||||||
|
split_examples.append(s_example)
|
||||||
|
return split_examples
|
||||||
|
|
||||||
|
|
||||||
def get_gold_parses(self, merge=False, vocab=None, make_projective=False,
|
def get_gold_parses(self, merge=True, vocab=None, make_projective=False,
|
||||||
ignore_misaligned=False):
|
ignore_misaligned=False):
|
||||||
"""Return a list of (doc, GoldParse) objects.
|
"""Return a list of (doc, GoldParse) objects.
|
||||||
If merge is set to True, add all Token annotations to one big list."""
|
If merge is set to True, keep all Token annotations as one big list."""
|
||||||
d = self.doc_annotation
|
d = self.doc_annotation
|
||||||
# merging different sentences
|
# merge == do not modify Example
|
||||||
if merge:
|
if merge:
|
||||||
merged_example = self.merge_sents()
|
t = self.token_annotation
|
||||||
assert(len(merged_example.token_annotations)) == 1
|
doc = self.doc
|
||||||
t = merged_example.token_annotations[0]
|
if not self.doc:
|
||||||
m_doc = merged_example.doc
|
|
||||||
if not m_doc:
|
|
||||||
if not vocab:
|
if not vocab:
|
||||||
raise ValueError(Errors.E998)
|
raise ValueError(Errors.E998)
|
||||||
m_doc = Doc(vocab, words=t.words)
|
doc = Doc(vocab, words=t.words)
|
||||||
try:
|
try:
|
||||||
gp = GoldParse.from_annotation(m_doc, d, t, make_projective=make_projective)
|
gp = GoldParse.from_annotation(doc, d, t,
|
||||||
|
make_projective=make_projective)
|
||||||
except AlignmentError:
|
except AlignmentError:
|
||||||
if ignore_misaligned:
|
if ignore_misaligned:
|
||||||
gp = None
|
gp = None
|
||||||
else:
|
else:
|
||||||
raise
|
raise
|
||||||
return [(self.doc, gp)]
|
return [(doc, gp)]
|
||||||
# we only have one sentence and an appropriate doc
|
# not merging: one GoldParse per sentence, defining docs with the words
|
||||||
elif len(self.token_annotations) == 1 and isinstance(self.doc, Doc):
|
# from each sentence
|
||||||
t = self.token_annotations[0]
|
|
||||||
try:
|
|
||||||
gp = GoldParse.from_annotation(self.doc, d, t, make_projective=make_projective)
|
|
||||||
except AlignmentError:
|
|
||||||
if ignore_misaligned:
|
|
||||||
gp = None
|
|
||||||
else:
|
|
||||||
raise
|
|
||||||
return [(self.doc, gp)]
|
|
||||||
# not merging: one GoldParse per 'sentence', defining docs with the words from each sentence
|
|
||||||
else:
|
else:
|
||||||
parses = []
|
parses = []
|
||||||
for t in self.token_annotations:
|
split_examples = self.split_sents()
|
||||||
|
for split_example in split_examples:
|
||||||
if not vocab:
|
if not vocab:
|
||||||
raise ValueError(Errors.E998)
|
raise ValueError(Errors.E998)
|
||||||
t_doc = Doc(vocab, words=t.words)
|
split_doc = Doc(vocab, words=split_example.token_annotation.words)
|
||||||
try:
|
try:
|
||||||
gp = GoldParse.from_annotation(t_doc, d, t, make_projective=make_projective)
|
gp = GoldParse.from_annotation(split_doc, d,
|
||||||
|
split_example.token_annotation,
|
||||||
|
make_projective=make_projective)
|
||||||
except AlignmentError:
|
except AlignmentError:
|
||||||
if ignore_misaligned:
|
if ignore_misaligned:
|
||||||
gp = None
|
gp = None
|
||||||
else:
|
else:
|
||||||
raise
|
raise
|
||||||
if gp is not None:
|
if gp is not None:
|
||||||
parses.append((t_doc, gp))
|
parses.append((split_doc, gp))
|
||||||
return parses
|
return parses
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
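A note on the head arithmetic in split_sents() above: heads are stored relative to the merged paragraph, and subtracting sent_start_i re-bases each sentence so it is self-contained. A small sketch with hypothetical values:

    heads = [1, 1, 1, 4, 4]   # paragraph-level heads; sentence 2 starts at token 3
    sent_start_i = 3
    rebased = [h - sent_start_i for h in heads[3:]]
    assert rebased == [1, 1]  # both point to local token 1, the sentence root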
@@ -881,9 +933,14 @@ cdef class GoldParse:
     """
     @classmethod
     def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False):
-        return cls(doc, words=token_annotation.words, tags=token_annotation.tags,
-                   heads=token_annotation.heads, deps=token_annotation.deps, entities=token_annotation.entities,
-                   morphology=token_annotation.morphology, cats=doc_annotation.cats, links=doc_annotation.links,
+        return cls(doc, words=token_annotation.words,
+                   tags=token_annotation.tags,
+                   heads=token_annotation.heads,
+                   deps=token_annotation.deps,
+                   entities=token_annotation.entities,
+                   morphs=token_annotation.morphs,
+                   cats=doc_annotation.cats,
+                   links=doc_annotation.links,
                    make_projective=make_projective)

     def get_token_annotation(self):
@@ -893,9 +950,9 @@ cdef class GoldParse:

         return TokenAnnotation(ids=ids, words=self.words, tags=self.tags,
                                heads=self.heads, deps=self.labels, entities=self.ner,
-                               morphology=self.morphology)
+                               morphs=self.morphs)

-    def __init__(self, doc, words=None, tags=None, morphology=None,
+    def __init__(self, doc, words=None, tags=None, morphs=None,
                  heads=None, deps=None, entities=None, make_projective=False,
                  cats=None, links=None):
         """Create a GoldParse. The fields will not be initialized if len(doc) is zero.
@@ -944,8 +1001,8 @@ cdef class GoldParse:
             heads = [None for _ in words]
         if not deps:
             deps = [None for _ in words]
-        if not morphology:
-            morphology = [None for _ in words]
+        if not morphs:
+            morphs = [None for _ in words]
         if entities is None:
             entities = ["-" for _ in words]
         elif len(entities) == 0:
@@ -971,7 +1028,7 @@ cdef class GoldParse:
             self.heads = [None] * len(doc)
             self.labels = [None] * len(doc)
             self.ner = [None] * len(doc)
-            self.morphology = [None] * len(doc)
+            self.morphs = [None] * len(doc)

             # This needs to be done before we align the words
             if make_projective and heads is not None and deps is not None:
@@ -990,7 +1047,7 @@ cdef class GoldParse:
             self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]

             self.orig = TokenAnnotation(ids=list(range(len(words))), words=words, tags=tags,
-                    heads=heads, deps=deps, entities=entities, morphology=morphology,
+                    heads=heads, deps=deps, entities=entities, morphs=morphs,
                     brackets=[])

             for i, gold_i in enumerate(self.cand_to_gold):
@@ -1000,12 +1057,12 @@ cdef class GoldParse:
                     self.heads[i] = None
                     self.labels[i] = None
                     self.ner[i] = None
-                    self.morphology[i] = set()
+                    self.morphs[i] = set()
                 if gold_i is None:
                     if i in i2j_multi:
                         self.words[i] = words[i2j_multi[i]]
                         self.tags[i] = tags[i2j_multi[i]]
-                        self.morphology[i] = morphology[i2j_multi[i]]
+                        self.morphs[i] = morphs[i2j_multi[i]]
                         is_last = i2j_multi[i] != i2j_multi.get(i+1)
                         is_first = i2j_multi[i] != i2j_multi.get(i-1)
                         # Set next word in multi-token span as head, until last
@@ -1044,7 +1101,7 @@ cdef class GoldParse:
                 else:
                     self.words[i] = words[gold_i]
                     self.tags[i] = tags[gold_i]
-                    self.morphology[i] = morphology[gold_i]
+                    self.morphs[i] = morphs[gold_i]
                     if heads[gold_i] is None:
                         self.heads[i] = None
                     else:
@@ -574,9 +574,8 @@ class Language(object):
         # Populate vocab
         else:
             for example in get_examples():
-                for token_annotation in example.token_annotations:
-                    for word in token_annotation.words:
-                        _ = self.vocab[word]  # noqa: F841
+                for word in example.token_annotation.words:
+                    _ = self.vocab[word]  # noqa: F841

         if cfg.get("device", -1) >= 0:
             util.use_gpu(cfg["device"])
@@ -565,12 +565,11 @@ class Tagger(Pipe):
         orig_tag_map = dict(self.vocab.morphology.tag_map)
         new_tag_map = OrderedDict()
         for example in get_examples():
-            for token_annotation in example.token_annotations:
-                for tag in token_annotation.tags:
-                    if tag in orig_tag_map:
-                        new_tag_map[tag] = orig_tag_map[tag]
-                    else:
-                        new_tag_map[tag] = {POS: X}
+            for tag in example.token_annotation.tags:
+                if tag in orig_tag_map:
+                    new_tag_map[tag] = orig_tag_map[tag]
+                else:
+                    new_tag_map[tag] = {POS: X}

         cdef Vocab vocab = self.vocab
         if new_tag_map:
             vocab.morphology = Morphology(vocab.strings, new_tag_map,
@@ -750,11 +749,10 @@ class MultitaskObjective(Tagger):
         gold_examples = nonproj.preprocess_training_data(get_examples())
         # for raw_text, doc_annot in gold_tuples:
         for example in gold_examples:
-            for token_annotation in example.token_annotations:
-                for i in range(len(token_annotation.ids)):
-                    label = self.make_label(i, token_annotation)
-                    if label is not None and label not in self.labels:
-                        self.labels[label] = len(self.labels)
+            for i in range(len(example.token_annotation.ids)):
+                label = self.make_label(i, example.token_annotation)
+                if label is not None and label not in self.labels:
+                    self.labels[label] = len(self.labels)

         if self.model is True:
             token_vector_width = util.env_opt("token_vector_width")
             self.model = self.Model(len(self.labels), tok2vec=tok2vec)
@@ -237,7 +237,7 @@ class Scorer(object):
         if len(doc) != len(gold):
             doc_annotation = DocAnnotation(cats=gold.cats)
             token_annotation = gold.orig
-            gold = GoldParse.from_annotation(doc, doc_annotation, [token_annotation])
+            gold = GoldParse.from_annotation(doc, doc_annotation, token_annotation)
         orig = gold.orig
         gold_deps = set()
         gold_deps_per_dep = {}
@@ -342,19 +342,19 @@ cdef class ArcEager(TransitionSystem):
             actions[RIGHT][label] = 1
             actions[REDUCE][label] = 1
         for example in kwargs.get('gold_parses', []):
-            for token_annotation in example.token_annotations:
-                heads, labels = nonproj.projectivize(token_annotation.heads, token_annotation.deps)
-                for child, head, label in zip(token_annotation.ids, heads, labels):
-                    if label.upper() == 'ROOT' :
-                        label = 'ROOT'
-                    if head == child:
-                        actions[BREAK][label] += 1
-                    elif head < child:
-                        actions[RIGHT][label] += 1
-                        actions[REDUCE][''] += 1
-                    elif head > child:
-                        actions[LEFT][label] += 1
-                        actions[SHIFT][''] += 1
+            heads, labels = nonproj.projectivize(example.token_annotation.heads,
+                                                 example.token_annotation.deps)
+            for child, head, label in zip(example.token_annotation.ids, heads, labels):
+                if label.upper() == 'ROOT' :
+                    label = 'ROOT'
+                if head == child:
+                    actions[BREAK][label] += 1
+                elif head < child:
+                    actions[RIGHT][label] += 1
+                    actions[REDUCE][''] += 1
+                elif head > child:
+                    actions[LEFT][label] += 1
+                    actions[SHIFT][''] += 1
         if min_freq is not None:
             for action, label_freqs in actions.items():
                 for label, freq in list(label_freqs.items()):
@@ -73,12 +73,11 @@ cdef class BiluoPushDown(TransitionSystem):
                 actions[action][entity_type] = 1
         moves = ('M', 'B', 'I', 'L', 'U')
         for example in kwargs.get('gold_parses', []):
-            for token_annotation in example.token_annotations:
-                for i, ner_tag in enumerate(token_annotation.entities):
-                    if ner_tag != 'O' and ner_tag != '-':
-                        _, label = ner_tag.split('-', 1)
-                        for action in (BEGIN, IN, LAST, UNIT):
-                            actions[action][label] += 1
+            for i, ner_tag in enumerate(example.token_annotation.entities):
+                if ner_tag != 'O' and ner_tag != '-':
+                    _, label = ner_tag.split('-', 1)
+                    for action in (BEGIN, IN, LAST, UNIT):
+                        actions[action][label] += 1
         return actions

     @property
@@ -81,15 +81,15 @@ def is_decorated(label):
 def count_decorated_labels(gold_data):
     freqs = {}
     for example in gold_data:
-        for token_annotation in example.token_annotations:
-            proj_heads, deco_deps = projectivize(token_annotation.heads, token_annotation.deps)
-            # set the label to ROOT for each root dependent
-            deco_deps = ['ROOT' if head == i else deco_deps[i]
-                         for i, head in enumerate(proj_heads)]
-            # count label frequencies
-            for label in deco_deps:
-                if is_decorated(label):
-                    freqs[label] = freqs.get(label, 0) + 1
+        proj_heads, deco_deps = projectivize(example.token_annotation.heads,
+                                             example.token_annotation.deps)
+        # set the label to ROOT for each root dependent
+        deco_deps = ['ROOT' if head == i else deco_deps[i]
+                     for i, head in enumerate(proj_heads)]
+        # count label frequencies
+        for label in deco_deps:
+            if is_decorated(label):
+                freqs[label] = freqs.get(label, 0) + 1
     return freqs
@@ -98,21 +98,20 @@ def preprocess_training_data(gold_data, label_freq_cutoff=30):
     freqs = {}
     for example in gold_data:
         new_example = Example(doc=example.doc)
-        for token_annotation in example.token_annotations:
-            proj_heads, deco_deps = projectivize(token_annotation.heads, token_annotation.deps)
-            # set the label to ROOT for each root dependent
-            deco_deps = ['ROOT' if head == i else deco_deps[i]
-                         for i, head in enumerate(proj_heads)]
-            # count label frequencies
-            if label_freq_cutoff > 0:
-                for label in deco_deps:
-                    if is_decorated(label):
-                        freqs[label] = freqs.get(label, 0) + 1
-            # TODO: the code would be less ugly when changing heads and deps in-place, but is this OK upstream ?
-            proj_token_dict = token_annotation.to_dict()
-            proj_token_dict["heads"] = proj_heads
-            proj_token_dict["deps"] = deco_deps
-            new_example.add_token_annotation(**proj_token_dict)
+        proj_heads, deco_deps = projectivize(example.token_annotation.heads,
+                                             example.token_annotation.deps)
+        # set the label to ROOT for each root dependent
+        deco_deps = ['ROOT' if head == i else deco_deps[i]
+                     for i, head in enumerate(proj_heads)]
+        # count label frequencies
+        if label_freq_cutoff > 0:
+            for label in deco_deps:
+                if is_decorated(label):
+                    freqs[label] = freqs.get(label, 0) + 1
+        proj_token_dict = example.token_annotation.to_dict()
+        proj_token_dict["heads"] = proj_heads
+        proj_token_dict["deps"] = deco_deps
+        new_example.set_token_annotation(**proj_token_dict)
         preprocessed.append(new_example)
     if label_freq_cutoff > 0:
         return _filter_labels(preprocessed, label_freq_cutoff, freqs)
@@ -213,15 +212,14 @@ def _filter_labels(examples, cutoff, freqs):
     filtered = []
     for example in examples:
         new_example = Example(doc=example.doc)
-        for token_annotation in example.token_annotations:
-            filtered_labels = []
-            for label in token_annotation.deps:
-                if is_decorated(label) and freqs.get(label, 0) < cutoff:
-                    filtered_labels.append(decompose(label)[0])
-                else:
-                    filtered_labels.append(label)
-            filtered_token_dict = token_annotation.to_dict()
-            filtered_token_dict["deps"] = filtered_labels
-            new_example.add_token_annotation(**filtered_token_dict)
+        filtered_labels = []
+        for label in example.token_annotation.deps:
+            if is_decorated(label) and freqs.get(label, 0) < cutoff:
+                filtered_labels.append(decompose(label)[0])
+            else:
+                filtered_labels.append(label)
+        filtered_token_dict = example.token_annotation.to_dict()
+        filtered_token_dict["deps"] = filtered_labels
+        new_example.set_token_annotation(**filtered_token_dict)
         filtered.append(new_example)
     return filtered
@@ -273,7 +273,7 @@ def test_issue1963(en_tokenizer):
 def test_issue1967(label):
     ner = EntityRecognizer(Vocab())
     example = Example(doc=None)
-    example.add_token_annotation(ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label])
+    example.set_token_annotation(ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label])
     ner.moves.get_actions(gold_parses=[example])
@@ -36,6 +36,16 @@ def doc():
     return doc


+@pytest.fixture()
+def merged_dict():
+    return {
+        "ids": [1, 2, 3, 4, 5, 6, 7],
+        "words": ["Hi", "there", "everyone", "It", "is", "just", "me"],
+        "tags": ["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"],
+        "sent_starts": [1, 0, 0, 1, 0, 0, 0, 0],
+    }
+
+
 def test_gold_biluo_U(en_vocab):
     words = ["I", "flew", "to", "London", "."]
     spaces = [True, True, True, False, True]
@@ -231,7 +241,7 @@ def test_ignore_misaligned(doc):
     deps = [t.dep_ for t in doc]
     heads = [t.head.i for t in doc]

-    use_new_align = spacy.gold.USE_NEW_ALIGN
+    saved_use_new_align = spacy.gold.USE_NEW_ALIGN

     spacy.gold.USE_NEW_ALIGN = False
     with make_tempdir() as tmpdir:
|
||||||
ignore_misaligned=True))
|
ignore_misaligned=True))
|
||||||
assert len(train_reloaded_example) == 0
|
assert len(train_reloaded_example) == 0
|
||||||
|
|
||||||
spacy.gold.USE_NEW_ALIGN = use_new_align
|
spacy.gold.USE_NEW_ALIGN = saved_use_new_align
|
||||||
|
|
||||||
|
|
||||||
|
def test_make_orth_variants(doc):
|
||||||
|
nlp = English()
|
||||||
|
text = doc.text
|
||||||
|
deps = [t.dep_ for t in doc]
|
||||||
|
heads = [t.head.i for t in doc]
|
||||||
|
|
||||||
|
with make_tempdir() as tmpdir:
|
||||||
|
jsonl_file = tmpdir / "test.jsonl"
|
||||||
|
# write to JSONL train dicts
|
||||||
|
srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
|
||||||
|
goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
|
||||||
|
|
||||||
|
# due to randomness, test only that this runs with no errors for now
|
||||||
|
train_reloaded_example = next(goldcorpus.train_dataset(nlp,
|
||||||
|
orth_variant_level=0.2))
|
||||||
|
train_goldparse = train_reloaded_example.gold
|
||||||
|
|
||||||
|
|
||||||
# xfail while we have backwards-compatible alignment
|
# xfail while we have backwards-compatible alignment
|
||||||
|
@@ -386,71 +414,38 @@ def _train(train_data):
         nlp.update(batch, sgd=optimizer, losses=losses)


-tokens_1 = {
-    "ids": [1, 2, 3],
-    "words": ["Hi", "there", "everyone"],
-    "tags": ["INTJ", "ADV", "PRON"],
-}
-
-tokens_2 = {
-    "ids": [1, 2, 3, 4],
-    "words": ["It", "is", "just", "me"],
-    "tags": ["PRON", "AUX", "ADV", "PRON"],
-}
-
-text0 = "Hi there everyone It is just me"
-
-
-def test_merge_sents():
+def test_split_sents(merged_dict):
     nlp = English()
     example = Example()
-    example.add_token_annotation(**tokens_1)
-    example.add_token_annotation(**tokens_2)
+    example.set_token_annotation(**merged_dict)
     assert len(example.get_gold_parses(merge=False, vocab=nlp.vocab)) == 2
-    assert len(example.get_gold_parses(merge=True, vocab=nlp.vocab)) == 1  # this shouldn't change the original object
+    assert len(example.get_gold_parses(merge=True, vocab=nlp.vocab)) == 1

-    merged_example = example.merge_sents()
+    split_examples = example.split_sents()
+    assert len(split_examples) == 2

-    token_annotation_1 = example.token_annotations[0]
+    token_annotation_1 = split_examples[0].token_annotation
     assert token_annotation_1.ids == [1, 2, 3]
     assert token_annotation_1.words == ["Hi", "there", "everyone"]
     assert token_annotation_1.tags == ["INTJ", "ADV", "PRON"]
+    assert token_annotation_1.sent_starts == [1, 0, 0]

-    token_annotation_m = merged_example.token_annotations[0]
-    assert token_annotation_m.ids == [1, 2, 3, 4, 5, 6, 7]
-    assert token_annotation_m.words == ["Hi", "there", "everyone", "It", "is", "just", "me"]
-    assert token_annotation_m.tags == ["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"]
+    token_annotation_2 = split_examples[1].token_annotation
+    assert token_annotation_2.ids == [4, 5, 6, 7]
+    assert token_annotation_2.words == ["It", "is", "just", "me"]
+    assert token_annotation_2.tags == ["PRON", "AUX", "ADV", "PRON"]
+    assert token_annotation_2.sent_starts == [1, 0, 0, 0]


-def test_tuples_to_example():
+def test_tuples_to_example(merged_dict):
     ex = Example()
-    ex.add_token_annotation(**tokens_1)
-    ex.add_token_annotation(**tokens_2)
-    ex.add_doc_annotation(cats={"TRAVEL": 1.0, "BAKING": 0.0})
+    ex.set_token_annotation(**merged_dict)
+    cats = {"TRAVEL": 1.0, "BAKING": 0.0}
+    ex.set_doc_annotation(cats=cats)
     ex_dict = ex.to_dict()

-    token_dicts = [
-        {
-            "ids": [1, 2, 3],
-            "words": ["Hi", "there", "everyone"],
-            "tags": ["INTJ", "ADV", "PRON"],
-            "heads": [],
-            "deps": [],
-            "entities": [],
-            "morphology": [],
-            "brackets": [],
-        },
-        {
-            "ids": [1, 2, 3, 4],
-            "words": ["It", "is", "just", "me"],
-            "tags": ["PRON", "AUX", "ADV", "PRON"],
-            "heads": [],
-            "deps": [],
-            "entities": [],
-            "morphology": [],
-            "brackets": [],
-        },
-    ]
-    doc_dict = {"cats": {"TRAVEL": 1.0, "BAKING": 0.0}, "links": {}}
-
-    assert ex_dict == {"token_annotations": token_dicts, "doc_annotation": doc_dict}
+    assert ex_dict["token_annotation"]["ids"] == merged_dict["ids"]
+    assert ex_dict["token_annotation"]["words"] == merged_dict["words"]
+    assert ex_dict["token_annotation"]["tags"] == merged_dict["tags"]
+    assert ex_dict["token_annotation"]["sent_starts"] == merged_dict["sent_starts"]
+    assert ex_dict["doc_annotation"]["cats"] == cats
@@ -86,7 +86,7 @@ def test_ner_per_type(en_vocab):
         ents=[[0, 1, "CARDINAL"], [2, 3, "CARDINAL"]],
     )
     ex = Example(doc=doc)
-    ex.add_token_annotation(entities=annot["entities"])
+    ex.set_token_annotation(entities=annot["entities"])
     scorer.score(ex)
     results = scorer.scores

@@ -107,7 +107,7 @@ def test_ner_per_type(en_vocab):
         ents=[[0, 1, "ORG"], [5, 6, "GPE"], [6, 7, "ORG"]],
    )
     ex = Example(doc=doc)
-    ex.add_token_annotation(entities=annot["entities"])
+    ex.set_token_annotation(entities=annot["entities"])
     scorer.score(ex)
     results = scorer.scores