This commit is contained in:
Matthew Honnibal 2020-06-06 15:13:07 +02:00
parent 0f9b4bbfea
commit 17533a9286
2 changed files with 87 additions and 38 deletions

View File

@ -32,15 +32,18 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
punct_choices = [random.choice(x["variants"]) for x in ndsv] punct_choices = [random.choice(x["variants"]) for x in ndsv]
for word_idx in range(len(words)): for word_idx in range(len(words)):
for punct_idx in range(len(ndsv)): for punct_idx in range(len(ndsv)):
if tags[word_idx] in ndsv[punct_idx]["tags"] \ if (
and words[word_idx] in ndsv[punct_idx]["variants"]: tags[word_idx] in ndsv[punct_idx]["tags"]
and words[word_idx] in ndsv[punct_idx]["variants"]
):
words[word_idx] = punct_choices[punct_idx] words[word_idx] = punct_choices[punct_idx]
# paired variants # paired variants
punct_choices = [random.choice(x["variants"]) for x in ndpv] punct_choices = [random.choice(x["variants"]) for x in ndpv]
for word_idx in range(len(words)): for word_idx in range(len(words)):
for punct_idx in range(len(ndpv)): for punct_idx in range(len(ndpv)):
if tags[word_idx] in ndpv[punct_idx]["tags"] \ if tags[word_idx] in ndpv[punct_idx]["tags"] and words[
and words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]): word_idx
] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]):
# backup option: random left vs. right from pair # backup option: random left vs. right from pair
pair_idx = random.choice([0, 1]) pair_idx = random.choice([0, 1])
# best option: rely on paired POS tags like `` / '' # best option: rely on paired POS tags like `` / ''
@ -64,7 +67,9 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
for single_variants in ndsv: for single_variants in ndsv:
variants.extend(single_variants["variants"]) variants.extend(single_variants["variants"])
for paired_variants in ndpv: for paired_variants in ndpv:
variants.extend(list(itertools.chain.from_iterable(paired_variants["variants"]))) variants.extend(
list(itertools.chain.from_iterable(paired_variants["variants"]))
)
# store variants in reverse length order to be able to prioritize # store variants in reverse length order to be able to prioritize
# longer matches (e.g., "---" before "--") # longer matches (e.g., "---" before "--")
variants = sorted(variants, key=lambda x: len(x)) variants = sorted(variants, key=lambda x: len(x))
@ -88,8 +93,7 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
# add variant word # add variant word
else: else:
for variant in variants: for variant in variants:
if not match_found and \ if not match_found and raw[raw_idx:].startswith(variant):
raw[raw_idx:].startswith(variant):
raw_idx += len(variant) raw_idx += len(variant)
variant_raw += word variant_raw += word
match_found = True match_found = True

View File

@ -1,17 +1,21 @@
from .annotation import TokenAnnotation, DocAnnotation from .annotation import TokenAnnotation, DocAnnotation
from ..errors import Errors, AlignmentError from ..errors import Errors, AlignmentError
from ..tokens import Doc from ..tokens import Doc
# We're hoping to kill this GoldParse dependency but for now match semantics. # We're hoping to kill this GoldParse dependency but for now match semantics.
from ..syntax.gold_parse import GoldParse from ..syntax.gold_parse import GoldParse
class Example: class Example:
def __init__(self, doc_annotation=None, token_annotation=None, doc=None, def __init__(
goldparse=None): self, doc_annotation=None, token_annotation=None, doc=None, goldparse=None
):
""" Doc can either be text, or an actual Doc """ """ Doc can either be text, or an actual Doc """
self.doc = doc self.doc = doc
self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation() self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation()
self.token_annotation = token_annotation if token_annotation else TokenAnnotation() self.token_annotation = (
token_annotation if token_annotation else TokenAnnotation()
)
self.goldparse = goldparse self.goldparse = goldparse
@classmethod @classmethod
@ -49,13 +53,33 @@ class Example:
self.goldparse = gold self.goldparse = gold
return self.goldparse return self.goldparse
def set_token_annotation(self, ids=None, words=None, tags=None, pos=None, def set_token_annotation(
morphs=None, lemmas=None, heads=None, deps=None, self,
entities=None, sent_starts=None, brackets=None): ids=None,
self.token_annotation = TokenAnnotation(ids=ids, words=words, tags=tags, words=None,
pos=pos, morphs=morphs, lemmas=lemmas, heads=heads, tags=None,
deps=deps, entities=entities, pos=None,
sent_starts=sent_starts, brackets=brackets) morphs=None,
lemmas=None,
heads=None,
deps=None,
entities=None,
sent_starts=None,
brackets=None,
):
self.token_annotation = TokenAnnotation(
ids=ids,
words=words,
tags=tags,
pos=pos,
morphs=morphs,
lemmas=lemmas,
heads=heads,
deps=deps,
entities=entities,
sent_starts=sent_starts,
brackets=brackets,
)
def set_doc_annotation(self, cats=None, links=None): def set_doc_annotation(self, cats=None, links=None):
if cats: if cats:
@ -77,11 +101,19 @@ class Example:
split_examples = [] split_examples = []
for i in range(len(t.words)): for i in range(len(t.words)):
if i > 0 and t.sent_starts[i] == 1: if i > 0 and t.sent_starts[i] == 1:
s_example.set_token_annotation(ids=s_ids, s_example.set_token_annotation(
words=s_words, tags=s_tags, pos=s_pos, morphs=s_morphs, ids=s_ids,
lemmas=s_lemmas, heads=s_heads, deps=s_deps, words=s_words,
entities=s_ents, sent_starts=s_sent_starts, tags=s_tags,
brackets=s_brackets) pos=s_pos,
morphs=s_morphs,
lemmas=s_lemmas,
heads=s_heads,
deps=s_deps,
entities=s_ents,
sent_starts=s_sent_starts,
brackets=s_brackets,
)
split_examples.append(s_example) split_examples.append(s_example)
s_example = Example(doc=None, doc_annotation=self.doc_annotation) s_example = Example(doc=None, doc_annotation=self.doc_annotation)
s_ids, s_words, s_tags, s_pos, s_heads = [], [], [], [], [] s_ids, s_words, s_tags, s_pos, s_heads = [], [], [], [], []
@ -99,20 +131,27 @@ class Example:
s_ents.append(t.get_entity(i)) s_ents.append(t.get_entity(i))
s_sent_starts.append(t.get_sent_start(i)) s_sent_starts.append(t.get_sent_start(i))
for b_end, b_label in t.brackets_by_start.get(i, []): for b_end, b_label in t.brackets_by_start.get(i, []):
s_brackets.append( s_brackets.append((i - sent_start_i, b_end - sent_start_i, b_label))
(i - sent_start_i, b_end - sent_start_i, b_label)
)
i += 1 i += 1
s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags, s_example.set_token_annotation(
pos=s_pos, morphs=s_morphs, lemmas=s_lemmas, heads=s_heads, ids=s_ids,
deps=s_deps, entities=s_ents, sent_starts=s_sent_starts, words=s_words,
brackets=s_brackets) tags=s_tags,
pos=s_pos,
morphs=s_morphs,
lemmas=s_lemmas,
heads=s_heads,
deps=s_deps,
entities=s_ents,
sent_starts=s_sent_starts,
brackets=s_brackets,
)
split_examples.append(s_example) split_examples.append(s_example)
return split_examples return split_examples
def get_gold_parses(
def get_gold_parses(self, merge=True, vocab=None, make_projective=False, self, merge=True, vocab=None, make_projective=False, ignore_misaligned=False
ignore_misaligned=False): ):
"""Return a list of (doc, GoldParse) objects. """Return a list of (doc, GoldParse) objects.
If merge is set to True, keep all Token annotations as one big list.""" If merge is set to True, keep all Token annotations as one big list."""
d = self.doc_annotation d = self.doc_annotation
@ -125,8 +164,9 @@ class Example:
raise ValueError(Errors.E998) raise ValueError(Errors.E998)
doc = Doc(vocab, words=t.words) doc = Doc(vocab, words=t.words)
try: try:
gp = GoldParse.from_annotation(doc, d, t, gp = GoldParse.from_annotation(
make_projective=make_projective) doc, d, t, make_projective=make_projective
)
except AlignmentError: except AlignmentError:
if ignore_misaligned: if ignore_misaligned:
gp = None gp = None
@ -143,9 +183,12 @@ class Example:
raise ValueError(Errors.E998) raise ValueError(Errors.E998)
split_doc = Doc(vocab, words=split_example.token_annotation.words) split_doc = Doc(vocab, words=split_example.token_annotation.words)
try: try:
gp = GoldParse.from_annotation(split_doc, d, gp = GoldParse.from_annotation(
split_example.token_annotation, split_doc,
make_projective=make_projective) d,
split_example.token_annotation,
make_projective=make_projective,
)
except AlignmentError: except AlignmentError:
if ignore_misaligned: if ignore_misaligned:
gp = None gp = None
@ -194,7 +237,9 @@ class Example:
else: else:
gold = None gold = None
if gold is not None: if gold is not None:
converted_examples.append(Example.from_gold(goldparse=gold, doc=doc)) converted_examples.append(
Example.from_gold(goldparse=gold, doc=doc)
)
else: else:
raise ValueError(Errors.E999.format(gold_dict=gold_dict)) raise ValueError(Errors.E999.format(gold_dict=gold_dict))
else: else: