diff --git a/spacy/_gold/augment.py b/spacy/_gold/augment.py index 02c812825..1fffe6187 100644 --- a/spacy/_gold/augment.py +++ b/spacy/_gold/augment.py @@ -32,15 +32,18 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0): punct_choices = [random.choice(x["variants"]) for x in ndsv] for word_idx in range(len(words)): for punct_idx in range(len(ndsv)): - if tags[word_idx] in ndsv[punct_idx]["tags"] \ - and words[word_idx] in ndsv[punct_idx]["variants"]: + if ( + tags[word_idx] in ndsv[punct_idx]["tags"] + and words[word_idx] in ndsv[punct_idx]["variants"] + ): words[word_idx] = punct_choices[punct_idx] # paired variants punct_choices = [random.choice(x["variants"]) for x in ndpv] for word_idx in range(len(words)): for punct_idx in range(len(ndpv)): - if tags[word_idx] in ndpv[punct_idx]["tags"] \ - and words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]): + if tags[word_idx] in ndpv[punct_idx]["tags"] and words[ + word_idx + ] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]): # backup option: random left vs. right from pair pair_idx = random.choice([0, 1]) # best option: rely on paired POS tags like `` / '' @@ -64,7 +67,9 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0): for single_variants in ndsv: variants.extend(single_variants["variants"]) for paired_variants in ndpv: - variants.extend(list(itertools.chain.from_iterable(paired_variants["variants"]))) + variants.extend( + list(itertools.chain.from_iterable(paired_variants["variants"])) + ) # store variants in reverse length order to be able to prioritize # longer matches (e.g., "---" before "--") variants = sorted(variants, key=lambda x: len(x)) @@ -88,8 +93,7 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0): # add variant word else: for variant in variants: - if not match_found and \ - raw[raw_idx:].startswith(variant): + if not match_found and raw[raw_idx:].startswith(variant): raw_idx += len(variant) variant_raw += word match_found = True diff --git a/spacy/_gold/example.py b/spacy/_gold/example.py index 969ba0374..c637c5540 100644 --- a/spacy/_gold/example.py +++ b/spacy/_gold/example.py @@ -1,17 +1,21 @@ from .annotation import TokenAnnotation, DocAnnotation from ..errors import Errors, AlignmentError from ..tokens import Doc + # We're hoping to kill this GoldParse dependency but for now match semantics. from ..syntax.gold_parse import GoldParse class Example: - def __init__(self, doc_annotation=None, token_annotation=None, doc=None, - goldparse=None): + def __init__( + self, doc_annotation=None, token_annotation=None, doc=None, goldparse=None + ): """ Doc can either be text, or an actual Doc """ self.doc = doc self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation() - self.token_annotation = token_annotation if token_annotation else TokenAnnotation() + self.token_annotation = ( + token_annotation if token_annotation else TokenAnnotation() + ) self.goldparse = goldparse @classmethod @@ -49,13 +53,33 @@ class Example: self.goldparse = gold return self.goldparse - def set_token_annotation(self, ids=None, words=None, tags=None, pos=None, - morphs=None, lemmas=None, heads=None, deps=None, - entities=None, sent_starts=None, brackets=None): - self.token_annotation = TokenAnnotation(ids=ids, words=words, tags=tags, - pos=pos, morphs=morphs, lemmas=lemmas, heads=heads, - deps=deps, entities=entities, - sent_starts=sent_starts, brackets=brackets) + def set_token_annotation( + self, + ids=None, + words=None, + tags=None, + pos=None, + morphs=None, + lemmas=None, + heads=None, + deps=None, + entities=None, + sent_starts=None, + brackets=None, + ): + self.token_annotation = TokenAnnotation( + ids=ids, + words=words, + tags=tags, + pos=pos, + morphs=morphs, + lemmas=lemmas, + heads=heads, + deps=deps, + entities=entities, + sent_starts=sent_starts, + brackets=brackets, + ) def set_doc_annotation(self, cats=None, links=None): if cats: @@ -77,11 +101,19 @@ class Example: split_examples = [] for i in range(len(t.words)): if i > 0 and t.sent_starts[i] == 1: - s_example.set_token_annotation(ids=s_ids, - words=s_words, tags=s_tags, pos=s_pos, morphs=s_morphs, - lemmas=s_lemmas, heads=s_heads, deps=s_deps, - entities=s_ents, sent_starts=s_sent_starts, - brackets=s_brackets) + s_example.set_token_annotation( + ids=s_ids, + words=s_words, + tags=s_tags, + pos=s_pos, + morphs=s_morphs, + lemmas=s_lemmas, + heads=s_heads, + deps=s_deps, + entities=s_ents, + sent_starts=s_sent_starts, + brackets=s_brackets, + ) split_examples.append(s_example) s_example = Example(doc=None, doc_annotation=self.doc_annotation) s_ids, s_words, s_tags, s_pos, s_heads = [], [], [], [], [] @@ -99,20 +131,27 @@ class Example: s_ents.append(t.get_entity(i)) s_sent_starts.append(t.get_sent_start(i)) for b_end, b_label in t.brackets_by_start.get(i, []): - s_brackets.append( - (i - sent_start_i, b_end - sent_start_i, b_label) - ) + s_brackets.append((i - sent_start_i, b_end - sent_start_i, b_label)) i += 1 - s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags, - pos=s_pos, morphs=s_morphs, lemmas=s_lemmas, heads=s_heads, - deps=s_deps, entities=s_ents, sent_starts=s_sent_starts, - brackets=s_brackets) + s_example.set_token_annotation( + ids=s_ids, + words=s_words, + tags=s_tags, + pos=s_pos, + morphs=s_morphs, + lemmas=s_lemmas, + heads=s_heads, + deps=s_deps, + entities=s_ents, + sent_starts=s_sent_starts, + brackets=s_brackets, + ) split_examples.append(s_example) return split_examples - - def get_gold_parses(self, merge=True, vocab=None, make_projective=False, - ignore_misaligned=False): + def get_gold_parses( + self, merge=True, vocab=None, make_projective=False, ignore_misaligned=False + ): """Return a list of (doc, GoldParse) objects. If merge is set to True, keep all Token annotations as one big list.""" d = self.doc_annotation @@ -125,8 +164,9 @@ class Example: raise ValueError(Errors.E998) doc = Doc(vocab, words=t.words) try: - gp = GoldParse.from_annotation(doc, d, t, - make_projective=make_projective) + gp = GoldParse.from_annotation( + doc, d, t, make_projective=make_projective + ) except AlignmentError: if ignore_misaligned: gp = None @@ -143,9 +183,12 @@ class Example: raise ValueError(Errors.E998) split_doc = Doc(vocab, words=split_example.token_annotation.words) try: - gp = GoldParse.from_annotation(split_doc, d, - split_example.token_annotation, - make_projective=make_projective) + gp = GoldParse.from_annotation( + split_doc, + d, + split_example.token_annotation, + make_projective=make_projective, + ) except AlignmentError: if ignore_misaligned: gp = None @@ -194,7 +237,9 @@ class Example: else: gold = None if gold is not None: - converted_examples.append(Example.from_gold(goldparse=gold, doc=doc)) + converted_examples.append( + Example.from_gold(goldparse=gold, doc=doc) + ) else: raise ValueError(Errors.E999.format(gold_dict=gold_dict)) else: