mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-26 21:51:24 +03:00 
			
		
		
		
	Restructure Example with merged sents as default (#4632)
* Switch to train_dataset() function in train CLI * Fixes for pipe() methods in pipeline components * Don't clobber `examples` variable with `as_example` in pipe() methods * Remove unnecessary traversals of `examples` * Update Parser.pipe() for Examples * Add `as_examples` kwarg to `pipe()` with implementation to return `Example`s * Accept `Doc` or `Example` in `pipe()` with `_get_doc()` (copied from `Pipe`) * Fixes to Example implementation in spacy.gold * Move `make_projective` from an attribute of Example to an argument of `Example.get_gold_parses()` * Head of 0 are not treated as unset * Unset heads are set to self rather than `None` (which causes problems while projectivizing) * Check for `Doc` (not just not `None`) when creating GoldParses for pre-merged example * Don't clobber `examples` variable in `iter_gold_docs()` * Add/modify gold tests for handling projectivity * In JSON roundtrip compare results from `dev_dataset` rather than `train_dataset` to avoid projectivization (and other potential modifications) * Add test for projective train vs. nonprojective dev versions of the same `Doc` * Handle ignore_misaligned as arg rather than attr Move `ignore_misaligned` from an attribute of `Example` to an argument to `Example.get_gold_parses()`, which makes it parallel to `make_projective`. Add test with old and new align that checks whether `ignore_misaligned` errors are raised as expected (only for new align). * Remove unused attrs from gold.pxd Remove `ignore_misaligned` and `make_projective` from `gold.pxd` * Restructure Example with merged sents as default An `Example` now includes a single `TokenAnnotation` that includes all the information from one `Doc` (=JSON `paragraph`). If required, the individual sentences can be returned as a list of examples with `Example.split_sents()` with no raw text available. * Input/output a single `Example.token_annotation` * Add `sent_starts` to `TokenAnnotation` to handle sentence boundaries * Replace `Example.merge_sents()` with `Example.split_sents()` * Modify components to use a single `Example.token_annotation` * Pipeline components * conllu2json converter * Rework/rename `add_token_annotation()` and `add_doc_annotation()` to `set_token_annotation()` and `set_doc_annotation()`, functions that set rather then appending/extending. * Rename `morphology` to `morphs` in `TokenAnnotation` and `GoldParse` * Add getters to `TokenAnnotation` to supply default values when a given attribute is not available * `Example.get_gold_parses()` in `spacy.gold._make_golds()` is only applied on single examples, so the `GoldParse` is returned saved in the provided `Example` rather than creating a new `Example` with no other internal annotation * Update tests for API changes and `merge_sents()` vs. `split_sents()` * Refer to Example.goldparse in iter_gold_docs() Use `Example.goldparse` in `iter_gold_docs()` instead of `Example.gold` because a `None` `GoldParse` is generated with ignore_misaligned and generating it on-the-fly can raise an unwanted AlignmentError * Fix make_orth_variants() Fix bug in make_orth_variants() related to conversion from multiple to one TokenAnnotation per Example. * Add basic test for make_orth_variants() * Replace try/except with conditionals * Replace default morph value with set
This commit is contained in:
		
							parent
							
								
									44829950ba
								
							
						
					
					
						commit
						392c4880d9
					
				|  | @ -24,17 +24,16 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_): | |||
|     checked_for_ner = False | ||||
|     has_ner_tags = False | ||||
|     for i, example in enumerate(conll_data): | ||||
|         for token_annotation in example.token_annotations: | ||||
|             if not checked_for_ner: | ||||
|                 has_ner_tags = is_ner(token_annotation.entities[0]) | ||||
|                 checked_for_ner = True | ||||
|             sentences.append(generate_sentence(token_annotation, has_ner_tags)) | ||||
|             # Real-sized documents could be extracted using the comments on the | ||||
|             # conluu document | ||||
|             if len(sentences) % n_sents == 0: | ||||
|                 doc = create_doc(sentences, i) | ||||
|                 docs.append(doc) | ||||
|                 sentences = [] | ||||
|         if not checked_for_ner: | ||||
|             has_ner_tags = is_ner(example.token_annotation.entities[0]) | ||||
|             checked_for_ner = True | ||||
|         sentences.append(generate_sentence(example.token_annotation, has_ner_tags)) | ||||
|         # Real-sized documents could be extracted using the comments on the | ||||
|         # conllu document | ||||
|         if len(sentences) % n_sents == 0: | ||||
|             doc = create_doc(sentences, i) | ||||
|             docs.append(doc) | ||||
|             sentences = [] | ||||
|     return docs | ||||
| 
 | ||||
| 
 | ||||
|  | @ -84,7 +83,7 @@ def read_conllx(input_data, use_morphology=False, n=0): | |||
|                     print(line) | ||||
|                     raise | ||||
|             example = Example(doc=None) | ||||
|             example.add_token_annotation(ids=ids, words=words, tags=tags, | ||||
|             example.set_token_annotation(ids=ids, words=words, tags=tags, | ||||
|                                          heads=heads, deps=deps, entities=ents) | ||||
|             yield example | ||||
|             i += 1 | ||||
|  |  | |||
|  | @ -25,7 +25,7 @@ cdef class GoldParse: | |||
|     cdef public int loss | ||||
|     cdef public list words | ||||
|     cdef public list tags | ||||
|     cdef public list morphology | ||||
|     cdef public list morphs | ||||
|     cdef public list heads | ||||
|     cdef public list labels | ||||
|     cdef public dict orths | ||||
|  | @ -45,7 +45,8 @@ cdef class TokenAnnotation: | |||
|     cdef public list heads | ||||
|     cdef public list deps | ||||
|     cdef public list entities | ||||
|     cdef public list morphology | ||||
|     cdef public list morphs | ||||
|     cdef public list sent_starts | ||||
|     cdef public list brackets | ||||
| 
 | ||||
| 
 | ||||
|  | @ -56,7 +57,7 @@ cdef class DocAnnotation: | |||
| 
 | ||||
| cdef class Example: | ||||
|     cdef public object doc | ||||
|     cdef public list token_annotations | ||||
|     cdef public TokenAnnotation token_annotation | ||||
|     cdef public DocAnnotation doc_annotation | ||||
|     cdef public object goldparse | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
										433
									
								
								spacy/gold.pyx
									
									
									
									
									
								
							
							
						
						
									
										433
									
								
								spacy/gold.pyx
									
									
									
									
									
								
							|  | @ -215,7 +215,7 @@ class GoldCorpus(object): | |||
|             ex_dict = example.to_dict() | ||||
|             text = example.text | ||||
|             srsly.write_msgpack(directory / "{}.msg".format(i), (text, ex_dict)) | ||||
|             n += len(example.token_annotations) | ||||
|             n += 1 | ||||
|             if limit and n >= limit: | ||||
|                 break | ||||
| 
 | ||||
|  | @ -271,7 +271,7 @@ class GoldCorpus(object): | |||
|                 raise ValueError(Errors.E124.format(path=path2str(loc), formats=supported)) | ||||
|             for example in examples: | ||||
|                 yield example | ||||
|                 i += len(example.token_annotations) | ||||
|                 i += 1 | ||||
|                 if limit and i >= limit: | ||||
|                     return | ||||
| 
 | ||||
|  | @ -286,15 +286,14 @@ class GoldCorpus(object): | |||
|         yield from self.read_examples(locs, limit=self.limit) | ||||
| 
 | ||||
|     def count_train(self): | ||||
|         # TODO: should this count words or sentences ? | ||||
|         """Returns count of words in train examples""" | ||||
|         n = 0 | ||||
|         i = 0 | ||||
|         for example in self.train_examples: | ||||
|             for token_annotation in example.token_annotations: | ||||
|                 n += len(token_annotation.words) | ||||
|                 if self.limit and i >= self.limit: | ||||
|                     break | ||||
|                 i += 1 | ||||
|             n += len(example.token_annotation.words) | ||||
|             if self.limit and i >= self.limit: | ||||
|                 break | ||||
|             i += 1 | ||||
|         return n | ||||
| 
 | ||||
|     def train_dataset(self, nlp, gold_preproc=False, max_length=None, | ||||
|  | @ -328,18 +327,27 @@ class GoldCorpus(object): | |||
|     def iter_gold_docs(cls, nlp, examples, gold_preproc, max_length=None, | ||||
|                        noise_level=0.0, orth_variant_level=0.0, | ||||
|                        make_projective=False, ignore_misaligned=False): | ||||
|         """ Setting gold_preproc will result in creating a doc per 'sentence' """ | ||||
|         """ Setting gold_preproc will result in creating a doc per sentence """ | ||||
|         for example in examples: | ||||
|             if gold_preproc: | ||||
|                 example.doc = None | ||||
|                 split_examples = example.split_sents() | ||||
|                 example_golds = [] | ||||
|                 for split_example in split_examples: | ||||
|                     split_example_docs = cls._make_docs(nlp, split_example, | ||||
|                             gold_preproc, noise_level=noise_level, | ||||
|                             orth_variant_level=orth_variant_level) | ||||
|                     split_example_golds = cls._make_golds(split_example_docs, | ||||
|                             vocab=nlp.vocab, make_projective=make_projective, | ||||
|                             ignore_misaligned=ignore_misaligned) | ||||
|                     example_golds.extend(split_example_golds) | ||||
|             else: | ||||
|                 example = example.merge_sents() | ||||
|             example_docs = cls._make_docs(nlp, example, | ||||
|                                       gold_preproc, noise_level=noise_level, | ||||
|                                       orth_variant_level=orth_variant_level) | ||||
|             example_golds = cls._make_golds(example_docs, vocab=nlp.vocab, | ||||
|                                             make_projective=make_projective, | ||||
|                                             ignore_misaligned=ignore_misaligned) | ||||
|                 example_docs = cls._make_docs(nlp, example, | ||||
|                         gold_preproc, noise_level=noise_level, | ||||
|                         orth_variant_level=orth_variant_level) | ||||
|                 example_golds = cls._make_golds(example_docs, vocab=nlp.vocab, | ||||
|                         make_projective=make_projective, | ||||
|                         ignore_misaligned=ignore_misaligned) | ||||
|             for ex in example_golds: | ||||
|                 if ex.goldparse is not None: | ||||
|                     if (not max_length) or len(ex.doc) < max_length: | ||||
|  | @ -353,35 +361,28 @@ class GoldCorpus(object): | |||
|             var_text = add_noise(var_example.text, noise_level) | ||||
|             var_doc = nlp.make_doc(var_text) | ||||
|             var_example.doc = var_doc | ||||
|             return [var_example] | ||||
|         else: | ||||
|             doc_examples = [] | ||||
|             for token_annotation in var_example.token_annotations: | ||||
|                 t_doc = Doc(nlp.vocab, words=add_noise(token_annotation.words, noise_level)) | ||||
|                 doc_example = Example(doc_annotation=example.doc_annotation, | ||||
|                                       token_annotations=[token_annotation], | ||||
|                                       doc=t_doc) | ||||
|                 doc_examples.append(doc_example) | ||||
|             return doc_examples | ||||
|             var_doc = Doc(nlp.vocab, words=add_noise(var_example.token_annotation.words, noise_level)) | ||||
|             var_example.doc = var_doc | ||||
|         return [var_example] | ||||
| 
 | ||||
|     @classmethod | ||||
|     def _make_golds(cls, examples, vocab=None, make_projective=False, | ||||
|                     ignore_misaligned=False): | ||||
|         gold_examples = [] | ||||
|         for example in examples: | ||||
|             gold_parses = example.get_gold_parses(vocab=vocab, | ||||
|                     make_projective=make_projective, | ||||
|                     ignore_misaligned=ignore_misaligned) | ||||
|             for (doc, gold) in gold_parses: | ||||
|                 ex = Example(doc=doc) | ||||
|                 ex.goldparse = gold | ||||
|                 gold_examples.append(ex) | ||||
|         return gold_examples | ||||
|             assert len(gold_parses) == 1 | ||||
|             assert gold_parses[0][0] == example.doc | ||||
|             example.goldparse = gold_parses[0][1] | ||||
|         return examples | ||||
| 
 | ||||
| 
 | ||||
| def make_orth_variants(nlp, example, orth_variant_level=0.0): | ||||
|     if random.random() >= orth_variant_level: | ||||
|         return example | ||||
|     if not example.token_annotations: | ||||
|     if not example.token_annotation: | ||||
|         return example | ||||
|     raw = example.text | ||||
|     if random.random() >= 0.5: | ||||
|  | @ -392,46 +393,46 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0): | |||
|     ndpv = nlp.Defaults.paired_orth_variants | ||||
|     # modify words in paragraph_tuples | ||||
|     variant_example = Example(doc=raw) | ||||
|     for token_annotation in example.token_annotations: | ||||
|         words = token_annotation.words | ||||
|         tags = token_annotation.tags | ||||
|         if not words or not tags: | ||||
|            # add the unmodified annotation | ||||
|             token_dict = token_annotation.to_dict() | ||||
|             variant_example.add_token_annotation(**token_dict) | ||||
|         else: | ||||
|             if lower: | ||||
|                 words = [w.lower() for w in words] | ||||
|             # single variants | ||||
|             punct_choices = [random.choice(x["variants"]) for x in ndsv] | ||||
|             for word_idx in range(len(words)): | ||||
|                 for punct_idx in range(len(ndsv)): | ||||
|                     if tags[word_idx] in ndsv[punct_idx]["tags"] \ | ||||
|                             and words[word_idx] in ndsv[punct_idx]["variants"]: | ||||
|                         words[word_idx] = punct_choices[punct_idx] | ||||
|             # paired variants | ||||
|             punct_choices = [random.choice(x["variants"]) for x in ndpv] | ||||
|             for word_idx in range(len(words)): | ||||
|                 for punct_idx in range(len(ndpv)): | ||||
|                     if tags[word_idx] in ndpv[punct_idx]["tags"] \ | ||||
|                             and words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]): | ||||
|                         # backup option: random left vs. right from pair | ||||
|                         pair_idx = random.choice([0, 1]) | ||||
|                         # best option: rely on paired POS tags like `` / '' | ||||
|                         if len(ndpv[punct_idx]["tags"]) == 2: | ||||
|                             pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx]) | ||||
|                         # next best option: rely on position in variants | ||||
|                         # (may not be unambiguous, so order of variants matters) | ||||
|                         else: | ||||
|                             for pair in ndpv[punct_idx]["variants"]: | ||||
|                                 if words[word_idx] in pair: | ||||
|                                     pair_idx = pair.index(words[word_idx]) | ||||
|                         words[word_idx] = punct_choices[punct_idx][pair_idx] | ||||
|     token_annotation = example.token_annotation | ||||
|     words = token_annotation.words | ||||
|     tags = token_annotation.tags | ||||
|     if not words or not tags: | ||||
|        # add the unmodified annotation | ||||
|         token_dict = token_annotation.to_dict() | ||||
|         variant_example.set_token_annotation(**token_dict) | ||||
|     else: | ||||
|         if lower: | ||||
|             words = [w.lower() for w in words] | ||||
|         # single variants | ||||
|         punct_choices = [random.choice(x["variants"]) for x in ndsv] | ||||
|         for word_idx in range(len(words)): | ||||
|             for punct_idx in range(len(ndsv)): | ||||
|                 if tags[word_idx] in ndsv[punct_idx]["tags"] \ | ||||
|                         and words[word_idx] in ndsv[punct_idx]["variants"]: | ||||
|                     words[word_idx] = punct_choices[punct_idx] | ||||
|         # paired variants | ||||
|         punct_choices = [random.choice(x["variants"]) for x in ndpv] | ||||
|         for word_idx in range(len(words)): | ||||
|             for punct_idx in range(len(ndpv)): | ||||
|                 if tags[word_idx] in ndpv[punct_idx]["tags"] \ | ||||
|                         and words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]): | ||||
|                     # backup option: random left vs. right from pair | ||||
|                     pair_idx = random.choice([0, 1]) | ||||
|                     # best option: rely on paired POS tags like `` / '' | ||||
|                     if len(ndpv[punct_idx]["tags"]) == 2: | ||||
|                         pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx]) | ||||
|                     # next best option: rely on position in variants | ||||
|                     # (may not be unambiguous, so order of variants matters) | ||||
|                     else: | ||||
|                         for pair in ndpv[punct_idx]["variants"]: | ||||
|                             if words[word_idx] in pair: | ||||
|                                 pair_idx = pair.index(words[word_idx]) | ||||
|                     words[word_idx] = punct_choices[punct_idx][pair_idx] | ||||
| 
 | ||||
|             token_dict = token_annotation.to_dict() | ||||
|             token_dict["words"] = words | ||||
|             token_dict["tags"] = tags | ||||
|             variant_example.add_token_annotation(**token_dict) | ||||
|         token_dict = token_annotation.to_dict() | ||||
|         token_dict["words"] = words | ||||
|         token_dict["tags"] = tags | ||||
|         variant_example.set_token_annotation(**token_dict) | ||||
|     # modify raw to match variant_paragraph_tuples | ||||
|     if raw is not None: | ||||
|         variants = [] | ||||
|  | @ -449,30 +450,29 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0): | |||
|         while raw_idx < len(raw) and re.match("\s", raw[raw_idx]): | ||||
|             variant_raw += raw[raw_idx] | ||||
|             raw_idx += 1 | ||||
|         for token_annotation in variant_example.token_annotations: | ||||
|             for word in token_annotation.words: | ||||
|                 match_found = False | ||||
|                 # add identical word | ||||
|                 if word not in variants and raw[raw_idx:].startswith(word): | ||||
|                     variant_raw += word | ||||
|                     raw_idx += len(word) | ||||
|                     match_found = True | ||||
|                 # add variant word | ||||
|                 else: | ||||
|                     for variant in variants: | ||||
|                         if not match_found and \ | ||||
|                                 raw[raw_idx:].startswith(variant): | ||||
|                             raw_idx += len(variant) | ||||
|                             variant_raw += word | ||||
|                             match_found = True | ||||
|                 # something went wrong, abort | ||||
|                 # (add a warning message?) | ||||
|                 if not match_found: | ||||
|                     return example | ||||
|                 # add following whitespace | ||||
|                 while raw_idx < len(raw) and re.match("\s", raw[raw_idx]): | ||||
|                     variant_raw += raw[raw_idx] | ||||
|                     raw_idx += 1 | ||||
|         for word in variant_example.token_annotation.words: | ||||
|             match_found = False | ||||
|             # add identical word | ||||
|             if word not in variants and raw[raw_idx:].startswith(word): | ||||
|                 variant_raw += word | ||||
|                 raw_idx += len(word) | ||||
|                 match_found = True | ||||
|             # add variant word | ||||
|             else: | ||||
|                 for variant in variants: | ||||
|                     if not match_found and \ | ||||
|                             raw[raw_idx:].startswith(variant): | ||||
|                         raw_idx += len(variant) | ||||
|                         variant_raw += word | ||||
|                         match_found = True | ||||
|             # something went wrong, abort | ||||
|             # (add a warning message?) | ||||
|             if not match_found: | ||||
|                 return example | ||||
|             # add following whitespace | ||||
|             while raw_idx < len(raw) and re.match("\s", raw[raw_idx]): | ||||
|                 variant_raw += raw[raw_idx] | ||||
|                 raw_idx += 1 | ||||
|         variant_example.doc = variant_raw | ||||
|         return variant_example | ||||
|     return variant_example | ||||
|  | @ -521,30 +521,43 @@ def json_to_examples(doc): | |||
|     paragraphs = [] | ||||
|     for paragraph in doc["paragraphs"]: | ||||
|         example = Example(doc=paragraph.get("raw", None)) | ||||
|         words = [] | ||||
|         ids = [] | ||||
|         tags = [] | ||||
|         heads = [] | ||||
|         labels = [] | ||||
|         ner = [] | ||||
|         morphs = [] | ||||
|         sent_starts = [] | ||||
|         brackets = [] | ||||
|         for sent in paragraph["sentences"]: | ||||
|             words = [] | ||||
|             ids = [] | ||||
|             tags = [] | ||||
|             heads = [] | ||||
|             labels = [] | ||||
|             ner = [] | ||||
|             sent_start_i = len(words) | ||||
|             for i, token in enumerate(sent["tokens"]): | ||||
|                 words.append(token["orth"]) | ||||
|                 ids.append(i) | ||||
|                 ids.append(token.get('id', sent_start_i + i)) | ||||
|                 tags.append(token.get('tag', "-")) | ||||
|                 heads.append(token.get("head", 0) + i) | ||||
|                 heads.append(token.get("head", 0) + sent_start_i + i) | ||||
|                 labels.append(token.get("dep", "")) | ||||
|                 # Ensure ROOT label is case-insensitive | ||||
|                 if labels[-1].lower() == "root": | ||||
|                     labels[-1] = "ROOT" | ||||
|                 ner.append(token.get("ner", "-")) | ||||
|             example.add_token_annotation(ids=ids, words=words, tags=tags, | ||||
|                                         heads=heads, deps=labels, entities=ner, | ||||
|                                         brackets=sent.get("brackets", [])) | ||||
|                 morphs.append(token.get("morph", {})) | ||||
|                 if i == 0: | ||||
|                     sent_starts.append(True) | ||||
|                 else: | ||||
|                     sent_starts.append(False) | ||||
|             if "brackets" in sent: | ||||
|                 brackets.extend((b["first"] + sent_start_i, | ||||
|                                  b["last"] + sent_start_i, b["label"]) | ||||
|                                  for b in sent["brackets"]) | ||||
|         cats = {} | ||||
|         for cat in paragraph.get("cats", {}): | ||||
|             cats[cat["label"]] = cat["value"] | ||||
|         example.add_doc_annotation(cats=cats) | ||||
|         example.set_token_annotation(ids=ids, words=words, tags=tags, | ||||
|                 heads=heads, deps=labels, entities=ner, morphs=morphs, | ||||
|                 sent_starts=sent_starts, brackets=brackets) | ||||
|         example.set_doc_annotation(cats=cats) | ||||
|         yield example | ||||
| 
 | ||||
| 
 | ||||
|  | @ -652,15 +665,16 @@ def _consume_ent(tags): | |||
| 
 | ||||
| 
 | ||||
| cdef class TokenAnnotation: | ||||
|     def __init__(self, ids=None, words=None, tags=None, heads=None, deps=None, entities=None, morphology=None, brackets=None): | ||||
|     def __init__(self, ids=None, words=None, tags=None, heads=None, deps=None, entities=None, morphs=None, sent_starts=None, brackets=None): | ||||
|         self.ids = ids if ids else [] | ||||
|         self.words = words if words else [] | ||||
|         self.tags = tags if tags else [] | ||||
|         self.heads = heads if heads else [] | ||||
|         self.deps = deps if deps else [] | ||||
|         self.entities = entities if entities else [] | ||||
|         self.morphs = morphs if morphs else [] | ||||
|         self.sent_starts = sent_starts if sent_starts else [] | ||||
|         self.brackets = brackets if brackets else [] | ||||
|         self.morphology = morphology if morphology else [] | ||||
| 
 | ||||
|     @classmethod | ||||
|     def from_dict(cls, token_dict): | ||||
|  | @ -670,7 +684,8 @@ cdef class TokenAnnotation: | |||
|                    heads=token_dict.get("heads", None), | ||||
|                    deps=token_dict.get("deps", None), | ||||
|                    entities=token_dict.get("entities", None), | ||||
|                    morphology=token_dict.get("morphology", None), | ||||
|                    morphs=token_dict.get("morphs", None), | ||||
|                    sent_starts=token_dict.get("sent_starts", None), | ||||
|                    brackets=token_dict.get("brackets", None)) | ||||
| 
 | ||||
|     def to_dict(self): | ||||
|  | @ -680,9 +695,34 @@ cdef class TokenAnnotation: | |||
|                 "heads": self.heads, | ||||
|                 "deps": self.deps, | ||||
|                 "entities": self.entities, | ||||
|                 "morphology": self.morphology, | ||||
|                 "morphs": self.morphs, | ||||
|                 "sent_starts": self.sent_starts, | ||||
|                 "brackets": self.brackets} | ||||
| 
 | ||||
|     def get_id(self, i): | ||||
|         return self.ids[i] if i < len(self.ids) else i | ||||
| 
 | ||||
|     def get_word(self, i): | ||||
|         return self.words[i] if i < len(self.words) else "" | ||||
| 
 | ||||
|     def get_tag(self, i): | ||||
|         return self.tags[i] if i < len(self.tags) else "-" | ||||
| 
 | ||||
|     def get_head(self, i): | ||||
|         return self.heads[i] if i < len(self.heads) else i | ||||
| 
 | ||||
|     def get_dep(self, i): | ||||
|         return self.deps[i] if i < len(self.deps) else "" | ||||
| 
 | ||||
|     def get_entity(self, i): | ||||
|         return self.entities[i] if i < len(self.entities) else "-" | ||||
| 
 | ||||
|     def get_morph(self, i): | ||||
|         return self.morphs[i] if i < len(self.morphs) else set() | ||||
| 
 | ||||
|     def get_sent_start(self, i): | ||||
|         return self.sent_starts[i] if i < len(self.sent_starts) else None | ||||
| 
 | ||||
| 
 | ||||
| cdef class DocAnnotation: | ||||
|     def __init__(self, cats=None, links=None): | ||||
|  | @ -698,33 +738,33 @@ cdef class DocAnnotation: | |||
| 
 | ||||
| 
 | ||||
| cdef class Example: | ||||
|     def __init__(self, doc_annotation=None, token_annotations=None, doc=None, | ||||
|     def __init__(self, doc_annotation=None, token_annotation=None, doc=None, | ||||
|                  goldparse=None): | ||||
|         """ Doc can either be text, or an actual Doc """ | ||||
|         self.doc = doc | ||||
|         self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation() | ||||
|         self.token_annotations = token_annotations if token_annotations else [] | ||||
|         self.token_annotation = token_annotation if token_annotation else TokenAnnotation() | ||||
|         self.goldparse = goldparse | ||||
| 
 | ||||
|     @classmethod | ||||
|     def from_gold(cls, goldparse, doc=None): | ||||
|         doc_annotation = DocAnnotation(cats=goldparse.cats, links=goldparse.links) | ||||
|         token_annotation = goldparse.get_token_annotation() | ||||
|         return cls(doc_annotation, [token_annotation], doc) | ||||
|         return cls(doc_annotation, token_annotation, doc) | ||||
| 
 | ||||
|     @classmethod | ||||
|     def from_dict(cls, example_dict, doc=None): | ||||
|         token_dicts = example_dict["token_annotations"] | ||||
|         token_annotations = [TokenAnnotation.from_dict(t) for t in token_dicts] | ||||
|         token_dict = example_dict["token_annotation"] | ||||
|         token_annotation = TokenAnnotation.from_dict(token_dict) | ||||
|         doc_dict = example_dict["doc_annotation"] | ||||
|         doc_annotation = DocAnnotation.from_dict(doc_dict) | ||||
|         return cls(doc_annotation, token_annotations, doc) | ||||
|         return cls(doc_annotation, token_annotation, doc) | ||||
| 
 | ||||
|     def to_dict(self): | ||||
|         """ Note that this method does NOT export the doc, only the annotations ! """ | ||||
|         token_dicts = [t.to_dict() for t in self.token_annotations] | ||||
|         token_dict = self.token_annotation.to_dict() | ||||
|         doc_dict = self.doc_annotation.to_dict() | ||||
|         return {"token_annotations": token_dicts, "doc_annotation": doc_dict} | ||||
|         return {"token_annotation": token_dict, "doc_annotation": doc_dict} | ||||
| 
 | ||||
|     @property | ||||
|     def text(self): | ||||
|  | @ -737,96 +777,108 @@ cdef class Example: | |||
|     @property | ||||
|     def gold(self): | ||||
|         if self.goldparse is None: | ||||
|             doc, gold = self.get_gold_parses(merge=True)[0] | ||||
|             doc, gold = self.get_gold_parses()[0] | ||||
|             self.goldparse = gold | ||||
|         return self.goldparse | ||||
| 
 | ||||
|     def add_token_annotation(self, ids=None, words=None, tags=None, heads=None, | ||||
|                              deps=None, entities=None, morphology=None, brackets=None): | ||||
|         t = TokenAnnotation(ids=ids, words=words, tags=tags, | ||||
|     def set_token_annotation(self, ids=None, words=None, tags=None, heads=None, | ||||
|                              deps=None, entities=None, morphs=None, | ||||
|                              sent_starts=None, brackets=None): | ||||
|         self.token_annotation = TokenAnnotation(ids=ids, words=words, tags=tags, | ||||
|                             heads=heads, deps=deps, entities=entities, | ||||
|                             morphology=morphology, brackets=brackets) | ||||
|         self.token_annotations.append(t) | ||||
|                             morphs=morphs, sent_starts=sent_starts, | ||||
|                             brackets=brackets) | ||||
| 
 | ||||
|     def add_doc_annotation(self, cats=None, links=None): | ||||
|     def set_doc_annotation(self, cats=None, links=None): | ||||
|         if cats: | ||||
|             self.doc_annotation.cats.update(cats) | ||||
|             self.doc_annotation.cats = cats | ||||
|         if links: | ||||
|             self.doc_annotation.links.update(links) | ||||
|             self.doc_annotation.links = links | ||||
| 
 | ||||
|     def merge_sents(self): | ||||
|         """ Merge the list of token annotations into one object and return this new object """ | ||||
|         m_example = Example(doc=self.doc, doc_annotation=self.doc_annotation) | ||||
|         m_ids, m_words, m_tags, m_heads, m_deps, m_ents, m_morph = [], [], [], [], [], [], [] | ||||
|         m_brackets = [] | ||||
|         i = 0 | ||||
|         for t in self.token_annotations: | ||||
|             m_ids.extend(id_ + i for id_ in t.ids) | ||||
|             m_words.extend(t.words) | ||||
|             m_tags.extend(t.tags) | ||||
|             m_heads.extend(head + i if head is not None and head >= 0 else head_i + i for head_i, head in enumerate(t.heads)) | ||||
|             m_deps.extend(t.deps) | ||||
|             m_ents.extend(t.entities) | ||||
|             m_morph.extend(t.morphology) | ||||
|             m_brackets.extend((b["first"] + i, b["last"] + i, b["label"]) | ||||
|                               for b in t.brackets) | ||||
|             i += len(t.ids) | ||||
|         m_example.add_token_annotation(ids=m_ids, words=m_words, tags=m_tags, | ||||
|                                        heads=m_heads, deps=m_deps, entities=m_ents, | ||||
|                                        morphology=m_morph, brackets=m_brackets) | ||||
|         return m_example | ||||
|     def split_sents(self): | ||||
|         """ Split the token annotations into multiple Examples based on | ||||
|         sent_starts and return a list of the new Examples""" | ||||
|         s_example = Example(doc=None, doc_annotation=self.doc_annotation) | ||||
|         s_ids, s_words, s_tags, s_heads = [], [], [], [] | ||||
|         s_deps, s_ents, s_morphs, s_sent_starts = [], [], [], [] | ||||
|         s_brackets = [] | ||||
|         sent_start_i = 0 | ||||
|         t = self.token_annotation | ||||
|         split_examples = [] | ||||
|         for i in range(len(t.words)): | ||||
|             if i > 0 and t.sent_starts[i] == True: | ||||
|                 s_example.set_token_annotation(ids=s_ids, | ||||
|                         words=s_words, tags=s_tags, heads=s_heads, deps=s_deps, | ||||
|                         entities=s_ents, morphs=s_morphs, | ||||
|                         sent_starts=s_sent_starts, brackets=s_brackets) | ||||
|                 split_examples.append(s_example) | ||||
|                 s_example = Example(doc=None, doc_annotation=self.doc_annotation) | ||||
|                 s_ids, s_words, s_tags, s_heads = [], [], [], [] | ||||
|                 s_deps, s_ents, s_morphs, s_sent_starts = [], [], [], [] | ||||
|                 s_brackets = [] | ||||
|                 sent_start_i = i | ||||
|             s_ids.append(t.get_id(i)) | ||||
|             s_words.append(t.get_word(i)) | ||||
|             s_tags.append(t.get_tag(i)) | ||||
|             s_heads.append(t.get_head(i) - sent_start_i) | ||||
|             s_deps.append(t.get_dep(i)) | ||||
|             s_ents.append(t.get_entity(i)) | ||||
|             s_morphs.append(t.get_morph(i)) | ||||
|             s_sent_starts.append(t.get_sent_start(i)) | ||||
|             s_brackets.extend((b[0] - sent_start_i, | ||||
|                                b[1] - sent_start_i, b[2]) | ||||
|                                for b in t.brackets if b[0] == i) | ||||
|             i += 1 | ||||
|         s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags, | ||||
|                 heads=s_heads, deps=s_deps, entities=s_ents, | ||||
|                 morphs=s_morphs, sent_starts=s_sent_starts, | ||||
|                 brackets=s_brackets) | ||||
|         split_examples.append(s_example) | ||||
|         return split_examples | ||||
| 
 | ||||
| 
 | ||||
|     def get_gold_parses(self, merge=False, vocab=None, make_projective=False, | ||||
|     def get_gold_parses(self, merge=True, vocab=None, make_projective=False, | ||||
|                         ignore_misaligned=False): | ||||
|         """Return a list of (doc, GoldParse) objects. | ||||
|         If merge is set to True, add all Token annotations to one big list.""" | ||||
|         If merge is set to True, keep all Token annotations as one big list.""" | ||||
|         d = self.doc_annotation | ||||
|         # merging different sentences | ||||
|         # merge == do not modify Example | ||||
|         if merge: | ||||
|             merged_example = self.merge_sents() | ||||
|             assert(len(merged_example.token_annotations)) == 1 | ||||
|             t = merged_example.token_annotations[0] | ||||
|             m_doc = merged_example.doc | ||||
|             if not m_doc: | ||||
|             t = self.token_annotation | ||||
|             doc = self.doc | ||||
|             if not self.doc: | ||||
|                 if not vocab: | ||||
|                     raise ValueError(Errors.E998) | ||||
|                 m_doc = Doc(vocab, words=t.words) | ||||
|                 doc = Doc(vocab, words=t.words) | ||||
|             try: | ||||
|                 gp = GoldParse.from_annotation(m_doc, d, t, make_projective=make_projective) | ||||
|                 gp = GoldParse.from_annotation(doc, d, t, | ||||
|                                                make_projective=make_projective) | ||||
|             except AlignmentError: | ||||
|                 if ignore_misaligned: | ||||
|                     gp = None | ||||
|                 else: | ||||
|                     raise | ||||
|             return [(self.doc, gp)] | ||||
|         # we only have one sentence and an appropriate doc | ||||
|         elif len(self.token_annotations) == 1 and isinstance(self.doc, Doc): | ||||
|             t = self.token_annotations[0] | ||||
|             try: | ||||
|                 gp = GoldParse.from_annotation(self.doc, d, t, make_projective=make_projective) | ||||
|             except AlignmentError: | ||||
|                 if ignore_misaligned: | ||||
|                     gp = None | ||||
|                 else: | ||||
|                     raise | ||||
|             return [(self.doc, gp)] | ||||
|         # not merging: one GoldParse per 'sentence', defining docs with the words from each sentence | ||||
|             return [(doc, gp)] | ||||
|         # not merging: one GoldParse per sentence, defining docs with the words | ||||
|         # from each sentence | ||||
|         else: | ||||
|             parses = [] | ||||
|             for t in self.token_annotations: | ||||
|             split_examples = self.split_sents() | ||||
|             for split_example in split_examples: | ||||
|                 if not vocab: | ||||
|                     raise ValueError(Errors.E998) | ||||
|                 t_doc = Doc(vocab, words=t.words) | ||||
|                 split_doc = Doc(vocab, words=split_example.token_annotation.words) | ||||
|                 try: | ||||
|                     gp = GoldParse.from_annotation(t_doc, d, t, make_projective=make_projective) | ||||
|                     gp = GoldParse.from_annotation(split_doc, d, | ||||
|                             split_example.token_annotation, | ||||
|                             make_projective=make_projective) | ||||
|                 except AlignmentError: | ||||
|                     if ignore_misaligned: | ||||
|                         gp = None | ||||
|                     else: | ||||
|                         raise | ||||
|                 if gp is not None: | ||||
|                     parses.append((t_doc, gp)) | ||||
|                     parses.append((split_doc, gp)) | ||||
|             return parses | ||||
| 
 | ||||
|     @classmethod | ||||
|  | @ -881,9 +933,14 @@ cdef class GoldParse: | |||
|     """ | ||||
|     @classmethod | ||||
|     def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False): | ||||
|         return cls(doc, words=token_annotation.words, tags=token_annotation.tags, | ||||
|                    heads=token_annotation.heads, deps=token_annotation.deps, entities=token_annotation.entities, | ||||
|                    morphology=token_annotation.morphology, cats=doc_annotation.cats, links=doc_annotation.links, | ||||
|         return cls(doc, words=token_annotation.words, | ||||
|                    tags=token_annotation.tags, | ||||
|                    heads=token_annotation.heads, | ||||
|                    deps=token_annotation.deps, | ||||
|                    entities=token_annotation.entities, | ||||
|                    morphs=token_annotation.morphs, | ||||
|                    cats=doc_annotation.cats, | ||||
|                    links=doc_annotation.links, | ||||
|                    make_projective=make_projective) | ||||
| 
 | ||||
|     def get_token_annotation(self): | ||||
|  | @ -893,9 +950,9 @@ cdef class GoldParse: | |||
| 
 | ||||
|         return TokenAnnotation(ids=ids, words=self.words, tags=self.tags, | ||||
|                                heads=self.heads, deps=self.labels, entities=self.ner, | ||||
|                                morphology=self.morphology) | ||||
|                                morphs=self.morphs) | ||||
| 
 | ||||
|     def __init__(self, doc, words=None, tags=None, morphology=None, | ||||
|     def __init__(self, doc, words=None, tags=None, morphs=None, | ||||
|                  heads=None, deps=None, entities=None, make_projective=False, | ||||
|                  cats=None, links=None): | ||||
|         """Create a GoldParse. The fields will not be initialized if len(doc) is zero. | ||||
|  | @ -944,8 +1001,8 @@ cdef class GoldParse: | |||
|                 heads = [None for _ in words] | ||||
|             if not deps: | ||||
|                 deps = [None for _ in words] | ||||
|             if not morphology: | ||||
|                 morphology = [None for _ in words] | ||||
|             if not morphs: | ||||
|                 morphs = [None for _ in words] | ||||
|             if entities is None: | ||||
|                 entities = ["-" for _ in words] | ||||
|             elif len(entities) == 0: | ||||
|  | @ -971,7 +1028,7 @@ cdef class GoldParse: | |||
|             self.heads = [None] * len(doc) | ||||
|             self.labels = [None] * len(doc) | ||||
|             self.ner = [None] * len(doc) | ||||
|             self.morphology = [None] * len(doc) | ||||
|             self.morphs = [None] * len(doc) | ||||
| 
 | ||||
|             # This needs to be done before we align the words | ||||
|             if make_projective and heads is not None and deps is not None: | ||||
|  | @ -990,7 +1047,7 @@ cdef class GoldParse: | |||
|             self.gold_to_cand = [(i if i >= 0 else None) for i in j2i] | ||||
| 
 | ||||
|             self.orig = TokenAnnotation(ids=list(range(len(words))), words=words, tags=tags, | ||||
|                                         heads=heads, deps=deps, entities=entities, morphology=morphology, | ||||
|                                         heads=heads, deps=deps, entities=entities, morphs=morphs, | ||||
|                                         brackets=[]) | ||||
| 
 | ||||
|             for i, gold_i in enumerate(self.cand_to_gold): | ||||
|  | @ -1000,12 +1057,12 @@ cdef class GoldParse: | |||
|                     self.heads[i] = None | ||||
|                     self.labels[i] = None | ||||
|                     self.ner[i] = None | ||||
|                     self.morphology[i] = set() | ||||
|                     self.morphs[i] = set() | ||||
|                 if gold_i is None: | ||||
|                     if i in i2j_multi: | ||||
|                         self.words[i] = words[i2j_multi[i]] | ||||
|                         self.tags[i] = tags[i2j_multi[i]] | ||||
|                         self.morphology[i] = morphology[i2j_multi[i]] | ||||
|                         self.morphs[i] = morphs[i2j_multi[i]] | ||||
|                         is_last = i2j_multi[i] != i2j_multi.get(i+1) | ||||
|                         is_first = i2j_multi[i] != i2j_multi.get(i-1) | ||||
|                         # Set next word in multi-token span as head, until last | ||||
|  | @ -1044,7 +1101,7 @@ cdef class GoldParse: | |||
|                 else: | ||||
|                     self.words[i] = words[gold_i] | ||||
|                     self.tags[i] = tags[gold_i] | ||||
|                     self.morphology[i] = morphology[gold_i] | ||||
|                     self.morphs[i] = morphs[gold_i] | ||||
|                     if heads[gold_i] is None: | ||||
|                         self.heads[i] = None | ||||
|                     else: | ||||
|  |  | |||
|  | @ -574,9 +574,8 @@ class Language(object): | |||
|         # Populate vocab | ||||
|         else: | ||||
|             for example in get_examples(): | ||||
|                 for token_annotation in example.token_annotations: | ||||
|                     for word in token_annotation.words: | ||||
|                         _ = self.vocab[word]  # noqa: F841 | ||||
|                 for word in example.token_annotation.words: | ||||
|                     _ = self.vocab[word]  # noqa: F841 | ||||
| 
 | ||||
|         if cfg.get("device", -1) >= 0: | ||||
|             util.use_gpu(cfg["device"]) | ||||
|  |  | |||
|  | @ -565,12 +565,11 @@ class Tagger(Pipe): | |||
|         orig_tag_map = dict(self.vocab.morphology.tag_map) | ||||
|         new_tag_map = OrderedDict() | ||||
|         for example in get_examples(): | ||||
|             for token_annotation in example.token_annotations: | ||||
|                 for tag in token_annotation.tags: | ||||
|                     if tag in orig_tag_map: | ||||
|                         new_tag_map[tag] = orig_tag_map[tag] | ||||
|                     else: | ||||
|                         new_tag_map[tag] = {POS: X} | ||||
|             for tag in example.token_annotation.tags: | ||||
|                 if tag in orig_tag_map: | ||||
|                     new_tag_map[tag] = orig_tag_map[tag] | ||||
|                 else: | ||||
|                     new_tag_map[tag] = {POS: X} | ||||
|         cdef Vocab vocab = self.vocab | ||||
|         if new_tag_map: | ||||
|             vocab.morphology = Morphology(vocab.strings, new_tag_map, | ||||
|  | @ -750,11 +749,10 @@ class MultitaskObjective(Tagger): | |||
|         gold_examples = nonproj.preprocess_training_data(get_examples()) | ||||
|         # for raw_text, doc_annot in gold_tuples: | ||||
|         for example in gold_examples: | ||||
|             for token_annotation in example.token_annotations: | ||||
|                 for i in range(len(token_annotation.ids)): | ||||
|                     label = self.make_label(i, token_annotation) | ||||
|                     if label is not None and label not in self.labels: | ||||
|                         self.labels[label] = len(self.labels) | ||||
|             for i in range(len(example.token_annotation.ids)): | ||||
|                 label = self.make_label(i, example.token_annotation) | ||||
|                 if label is not None and label not in self.labels: | ||||
|                     self.labels[label] = len(self.labels) | ||||
|         if self.model is True: | ||||
|             token_vector_width = util.env_opt("token_vector_width") | ||||
|             self.model = self.Model(len(self.labels), tok2vec=tok2vec) | ||||
|  |  | |||
|  | @ -237,7 +237,7 @@ class Scorer(object): | |||
|         if len(doc) != len(gold): | ||||
|             doc_annotation = DocAnnotation(cats=gold.cats) | ||||
|             token_annotation = gold.orig | ||||
|             gold = GoldParse.from_annotation(doc, doc_annotation, [token_annotation]) | ||||
|             gold = GoldParse.from_annotation(doc, doc_annotation, token_annotation) | ||||
|         orig = gold.orig | ||||
|         gold_deps = set() | ||||
|         gold_deps_per_dep = {} | ||||
|  |  | |||
|  | @ -342,19 +342,19 @@ cdef class ArcEager(TransitionSystem): | |||
|             actions[RIGHT][label] = 1 | ||||
|             actions[REDUCE][label] = 1 | ||||
|         for example in kwargs.get('gold_parses', []): | ||||
|             for token_annotation in example.token_annotations: | ||||
|                 heads, labels = nonproj.projectivize(token_annotation.heads, token_annotation.deps) | ||||
|                 for child, head, label in zip(token_annotation.ids, heads, labels): | ||||
|                     if label.upper() == 'ROOT' : | ||||
|                         label = 'ROOT' | ||||
|                     if head == child: | ||||
|                         actions[BREAK][label] += 1 | ||||
|                     elif head < child: | ||||
|                         actions[RIGHT][label] += 1 | ||||
|                         actions[REDUCE][''] += 1 | ||||
|                     elif head > child: | ||||
|                         actions[LEFT][label] += 1 | ||||
|                         actions[SHIFT][''] += 1 | ||||
|             heads, labels = nonproj.projectivize(example.token_annotation.heads, | ||||
|                                                  example.token_annotation.deps) | ||||
|             for child, head, label in zip(example.token_annotation.ids, heads, labels): | ||||
|                 if label.upper() == 'ROOT' : | ||||
|                     label = 'ROOT' | ||||
|                 if head == child: | ||||
|                     actions[BREAK][label] += 1 | ||||
|                 elif head < child: | ||||
|                     actions[RIGHT][label] += 1 | ||||
|                     actions[REDUCE][''] += 1 | ||||
|                 elif head > child: | ||||
|                     actions[LEFT][label] += 1 | ||||
|                     actions[SHIFT][''] += 1 | ||||
|         if min_freq is not None: | ||||
|             for action, label_freqs in actions.items(): | ||||
|                 for label, freq in list(label_freqs.items()): | ||||
|  |  | |||
|  | @ -73,12 +73,11 @@ cdef class BiluoPushDown(TransitionSystem): | |||
|                 actions[action][entity_type] = 1 | ||||
|         moves = ('M', 'B', 'I', 'L', 'U') | ||||
|         for example in kwargs.get('gold_parses', []): | ||||
|             for token_annotation in example.token_annotations: | ||||
|                 for i, ner_tag in enumerate(token_annotation.entities): | ||||
|                     if ner_tag != 'O' and ner_tag != '-': | ||||
|                         _, label = ner_tag.split('-', 1) | ||||
|                         for action in (BEGIN, IN, LAST, UNIT): | ||||
|                             actions[action][label] += 1 | ||||
|             for i, ner_tag in enumerate(example.token_annotation.entities): | ||||
|                 if ner_tag != 'O' and ner_tag != '-': | ||||
|                     _, label = ner_tag.split('-', 1) | ||||
|                     for action in (BEGIN, IN, LAST, UNIT): | ||||
|                         actions[action][label] += 1 | ||||
|         return actions | ||||
| 
 | ||||
|     @property | ||||
|  |  | |||
|  | @ -81,15 +81,15 @@ def is_decorated(label): | |||
| def count_decorated_labels(gold_data): | ||||
|     freqs = {} | ||||
|     for example in gold_data: | ||||
|         for token_annotation in example.token_annotations: | ||||
|             proj_heads, deco_deps = projectivize(token_annotation.heads, token_annotation.deps) | ||||
|             # set the label to ROOT for each root dependent | ||||
|             deco_deps = ['ROOT' if head == i else deco_deps[i] | ||||
|                            for i, head in enumerate(proj_heads)] | ||||
|             # count label frequencies | ||||
|             for label in deco_deps: | ||||
|                 if is_decorated(label): | ||||
|                     freqs[label] = freqs.get(label, 0) + 1 | ||||
|         proj_heads, deco_deps = projectivize(example.token_annotation.heads, | ||||
|                                              example.token_annotation.deps) | ||||
|         # set the label to ROOT for each root dependent | ||||
|         deco_deps = ['ROOT' if head == i else deco_deps[i] | ||||
|                        for i, head in enumerate(proj_heads)] | ||||
|         # count label frequencies | ||||
|         for label in deco_deps: | ||||
|             if is_decorated(label): | ||||
|                 freqs[label] = freqs.get(label, 0) + 1 | ||||
|     return freqs | ||||
| 
 | ||||
| 
 | ||||
|  | @ -98,21 +98,20 @@ def preprocess_training_data(gold_data, label_freq_cutoff=30): | |||
|     freqs = {} | ||||
|     for example in gold_data: | ||||
|         new_example = Example(doc=example.doc) | ||||
|         for token_annotation in example.token_annotations: | ||||
|             proj_heads, deco_deps = projectivize(token_annotation.heads, token_annotation.deps) | ||||
|             # set the label to ROOT for each root dependent | ||||
|             deco_deps = ['ROOT' if head == i else deco_deps[i] | ||||
|                            for i, head in enumerate(proj_heads)] | ||||
|             # count label frequencies | ||||
|             if label_freq_cutoff > 0: | ||||
|                 for label in deco_deps: | ||||
|                     if is_decorated(label): | ||||
|                         freqs[label] = freqs.get(label, 0) + 1 | ||||
|             # TODO: the code would be less ugly when changing heads and deps in-place, but is this OK upstream ? | ||||
|             proj_token_dict = token_annotation.to_dict() | ||||
|             proj_token_dict["heads"] = proj_heads | ||||
|             proj_token_dict["deps"] = deco_deps | ||||
|             new_example.add_token_annotation(**proj_token_dict) | ||||
|         proj_heads, deco_deps = projectivize(example.token_annotation.heads, | ||||
|                                              example.token_annotation.deps) | ||||
|         # set the label to ROOT for each root dependent | ||||
|         deco_deps = ['ROOT' if head == i else deco_deps[i] | ||||
|                        for i, head in enumerate(proj_heads)] | ||||
|         # count label frequencies | ||||
|         if label_freq_cutoff > 0: | ||||
|             for label in deco_deps: | ||||
|                 if is_decorated(label): | ||||
|                     freqs[label] = freqs.get(label, 0) + 1 | ||||
|         proj_token_dict = example.token_annotation.to_dict() | ||||
|         proj_token_dict["heads"] = proj_heads | ||||
|         proj_token_dict["deps"] = deco_deps | ||||
|         new_example.set_token_annotation(**proj_token_dict) | ||||
|         preprocessed.append(new_example) | ||||
|     if label_freq_cutoff > 0: | ||||
|         return _filter_labels(preprocessed, label_freq_cutoff, freqs) | ||||
|  | @ -213,15 +212,14 @@ def _filter_labels(examples, cutoff, freqs): | |||
|     filtered = [] | ||||
|     for example in examples: | ||||
|         new_example = Example(doc=example.doc) | ||||
|         for token_annotation in example.token_annotations: | ||||
|             filtered_labels = [] | ||||
|             for label in token_annotation.deps: | ||||
|                 if is_decorated(label) and freqs.get(label, 0) < cutoff: | ||||
|                     filtered_labels.append(decompose(label)[0]) | ||||
|                 else: | ||||
|                     filtered_labels.append(label) | ||||
|             filtered_token_dict = token_annotation.to_dict() | ||||
|             filtered_token_dict["deps"] = filtered_labels | ||||
|             new_example.add_token_annotation(**filtered_token_dict) | ||||
|         filtered_labels = [] | ||||
|         for label in example.token_annotation.deps: | ||||
|             if is_decorated(label) and freqs.get(label, 0) < cutoff: | ||||
|                 filtered_labels.append(decompose(label)[0]) | ||||
|             else: | ||||
|                 filtered_labels.append(label) | ||||
|         filtered_token_dict = example.token_annotation.to_dict() | ||||
|         filtered_token_dict["deps"] = filtered_labels | ||||
|         new_example.set_token_annotation(**filtered_token_dict) | ||||
|         filtered.append(new_example) | ||||
|     return filtered | ||||
|  |  | |||
|  | @ -273,7 +273,7 @@ def test_issue1963(en_tokenizer): | |||
| def test_issue1967(label): | ||||
|     ner = EntityRecognizer(Vocab()) | ||||
|     example = Example(doc=None) | ||||
|     example.add_token_annotation(ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label]) | ||||
|     example.set_token_annotation(ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label]) | ||||
|     ner.moves.get_actions(gold_parses=[example]) | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -36,6 +36,16 @@ def doc(): | |||
|     return doc | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture() | ||||
| def merged_dict(): | ||||
|     return { | ||||
|         "ids": [1, 2, 3, 4, 5, 6, 7], | ||||
|         "words": ["Hi", "there", "everyone", "It", "is", "just", "me"], | ||||
|         "tags": ["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"], | ||||
|         "sent_starts": [1, 0, 0, 1, 0, 0, 0, 0], | ||||
|         } | ||||
| 
 | ||||
| 
 | ||||
| def test_gold_biluo_U(en_vocab): | ||||
|     words = ["I", "flew", "to", "London", "."] | ||||
|     spaces = [True, True, True, False, True] | ||||
|  | @ -231,7 +241,7 @@ def test_ignore_misaligned(doc): | |||
|     deps = [t.dep_ for t in doc] | ||||
|     heads = [t.head.i for t in doc] | ||||
| 
 | ||||
|     use_new_align = spacy.gold.USE_NEW_ALIGN | ||||
|     saved_use_new_align = spacy.gold.USE_NEW_ALIGN | ||||
| 
 | ||||
|     spacy.gold.USE_NEW_ALIGN = False | ||||
|     with make_tempdir() as tmpdir: | ||||
|  | @ -270,7 +280,25 @@ def test_ignore_misaligned(doc): | |||
|                                   ignore_misaligned=True)) | ||||
|     assert len(train_reloaded_example) == 0 | ||||
| 
 | ||||
|     spacy.gold.USE_NEW_ALIGN = use_new_align | ||||
|     spacy.gold.USE_NEW_ALIGN = saved_use_new_align | ||||
| 
 | ||||
| 
 | ||||
| def test_make_orth_variants(doc): | ||||
|     nlp = English() | ||||
|     text = doc.text | ||||
|     deps = [t.dep_ for t in doc] | ||||
|     heads = [t.head.i for t in doc] | ||||
| 
 | ||||
|     with make_tempdir() as tmpdir: | ||||
|         jsonl_file = tmpdir / "test.jsonl" | ||||
|         # write to JSONL train dicts | ||||
|         srsly.write_jsonl(jsonl_file, [docs_to_json(doc)]) | ||||
|         goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) | ||||
| 
 | ||||
|     # due to randomness, test only that this runs with no errors for now | ||||
|     train_reloaded_example = next(goldcorpus.train_dataset(nlp, | ||||
|         orth_variant_level=0.2)) | ||||
|     train_goldparse = train_reloaded_example.gold | ||||
| 
 | ||||
| 
 | ||||
| # xfail while we have backwards-compatible alignment | ||||
|  | @ -386,71 +414,38 @@ def _train(train_data): | |||
|             nlp.update(batch, sgd=optimizer, losses=losses) | ||||
| 
 | ||||
| 
 | ||||
| tokens_1 = { | ||||
|     "ids": [1, 2, 3], | ||||
|     "words": ["Hi", "there", "everyone"], | ||||
|     "tags": ["INTJ", "ADV", "PRON"], | ||||
| } | ||||
| 
 | ||||
| tokens_2 = { | ||||
|     "ids": [1, 2, 3, 4], | ||||
|     "words": ["It", "is", "just", "me"], | ||||
|     "tags": ["PRON", "AUX", "ADV", "PRON"], | ||||
| } | ||||
| 
 | ||||
| text0 = "Hi there everyone It is just me" | ||||
| 
 | ||||
| 
 | ||||
| def test_merge_sents(): | ||||
| def test_split_sents(merged_dict): | ||||
|     nlp = English() | ||||
|     example = Example() | ||||
|     example.add_token_annotation(**tokens_1) | ||||
|     example.add_token_annotation(**tokens_2) | ||||
|     example.set_token_annotation(**merged_dict) | ||||
|     assert len(example.get_gold_parses(merge=False, vocab=nlp.vocab)) == 2 | ||||
|     assert len(example.get_gold_parses(merge=True, vocab=nlp.vocab)) == 1   # this shouldn't change the original object | ||||
|     assert len(example.get_gold_parses(merge=True, vocab=nlp.vocab)) == 1 | ||||
| 
 | ||||
|     merged_example = example.merge_sents() | ||||
|     split_examples = example.split_sents() | ||||
|     assert len(split_examples) == 2 | ||||
| 
 | ||||
|     token_annotation_1 = example.token_annotations[0] | ||||
|     token_annotation_1 = split_examples[0].token_annotation | ||||
|     assert token_annotation_1.ids == [1, 2, 3] | ||||
|     assert token_annotation_1.words == ["Hi", "there", "everyone"] | ||||
|     assert token_annotation_1.tags == ["INTJ", "ADV", "PRON"] | ||||
|     assert token_annotation_1.sent_starts == [1, 0, 0] | ||||
| 
 | ||||
|     token_annotation_m = merged_example.token_annotations[0] | ||||
|     assert token_annotation_m.ids == [1, 2, 3, 4, 5, 6, 7] | ||||
|     assert token_annotation_m.words == ["Hi", "there", "everyone", "It", "is", "just", "me"] | ||||
|     assert token_annotation_m.tags == ["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"] | ||||
|     token_annotation_2 = split_examples[1].token_annotation | ||||
|     assert token_annotation_2.ids == [4, 5, 6, 7] | ||||
|     assert token_annotation_2.words == ["It", "is", "just", "me"] | ||||
|     assert token_annotation_2.tags == ["PRON", "AUX", "ADV", "PRON"] | ||||
|     assert token_annotation_2.sent_starts == [1, 0, 0, 0] | ||||
| 
 | ||||
| 
 | ||||
| def test_tuples_to_example(): | ||||
| def test_tuples_to_example(merged_dict): | ||||
|     ex = Example() | ||||
|     ex.add_token_annotation(**tokens_1) | ||||
|     ex.add_token_annotation(**tokens_2) | ||||
|     ex.add_doc_annotation(cats={"TRAVEL": 1.0, "BAKING": 0.0}) | ||||
|     ex.set_token_annotation(**merged_dict) | ||||
|     cats = {"TRAVEL": 1.0, "BAKING": 0.0} | ||||
|     ex.set_doc_annotation(cats=cats) | ||||
|     ex_dict = ex.to_dict() | ||||
| 
 | ||||
|     token_dicts = [ | ||||
|         { | ||||
|             "ids": [1, 2, 3], | ||||
|             "words": ["Hi", "there", "everyone"], | ||||
|             "tags": ["INTJ", "ADV", "PRON"], | ||||
|             "heads": [], | ||||
|             "deps": [], | ||||
|             "entities": [], | ||||
|             "morphology": [], | ||||
|             "brackets": [], | ||||
|         }, | ||||
|         { | ||||
|             "ids": [1, 2, 3, 4], | ||||
|             "words": ["It", "is", "just", "me"], | ||||
|             "tags": ["PRON", "AUX", "ADV", "PRON"], | ||||
|             "heads": [], | ||||
|             "deps": [], | ||||
|             "entities": [], | ||||
|             "morphology": [], | ||||
|             "brackets": [], | ||||
|         }, | ||||
|     ] | ||||
|     doc_dict = {"cats": {"TRAVEL": 1.0, "BAKING": 0.0}, "links": {}} | ||||
| 
 | ||||
|     assert ex_dict == {"token_annotations": token_dicts, "doc_annotation": doc_dict} | ||||
|     assert ex_dict["token_annotation"]["ids"] == merged_dict["ids"] | ||||
|     assert ex_dict["token_annotation"]["words"] == merged_dict["words"] | ||||
|     assert ex_dict["token_annotation"]["tags"] == merged_dict["tags"] | ||||
|     assert ex_dict["token_annotation"]["sent_starts"] == merged_dict["sent_starts"] | ||||
|     assert ex_dict["doc_annotation"]["cats"] == cats | ||||
|  |  | |||
|  | @ -86,7 +86,7 @@ def test_ner_per_type(en_vocab): | |||
|             ents=[[0, 1, "CARDINAL"], [2, 3, "CARDINAL"]], | ||||
|         ) | ||||
|         ex = Example(doc=doc) | ||||
|         ex.add_token_annotation(entities=annot["entities"]) | ||||
|         ex.set_token_annotation(entities=annot["entities"]) | ||||
|         scorer.score(ex) | ||||
|     results = scorer.scores | ||||
| 
 | ||||
|  | @ -107,7 +107,7 @@ def test_ner_per_type(en_vocab): | |||
|             ents=[[0, 1, "ORG"], [5, 6, "GPE"], [6, 7, "ORG"]], | ||||
|         ) | ||||
|         ex = Example(doc=doc) | ||||
|         ex.add_token_annotation(entities=annot["entities"]) | ||||
|         ex.set_token_annotation(entities=annot["entities"]) | ||||
|         scorer.score(ex) | ||||
|     results = scorer.scores | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user