mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-06 06:30:35 +03:00
Format
This commit is contained in:
parent
0f9b4bbfea
commit
17533a9286
|
@ -32,15 +32,18 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
|
|||
punct_choices = [random.choice(x["variants"]) for x in ndsv]
|
||||
for word_idx in range(len(words)):
|
||||
for punct_idx in range(len(ndsv)):
|
||||
if tags[word_idx] in ndsv[punct_idx]["tags"] \
|
||||
and words[word_idx] in ndsv[punct_idx]["variants"]:
|
||||
if (
|
||||
tags[word_idx] in ndsv[punct_idx]["tags"]
|
||||
and words[word_idx] in ndsv[punct_idx]["variants"]
|
||||
):
|
||||
words[word_idx] = punct_choices[punct_idx]
|
||||
# paired variants
|
||||
punct_choices = [random.choice(x["variants"]) for x in ndpv]
|
||||
for word_idx in range(len(words)):
|
||||
for punct_idx in range(len(ndpv)):
|
||||
if tags[word_idx] in ndpv[punct_idx]["tags"] \
|
||||
and words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]):
|
||||
if tags[word_idx] in ndpv[punct_idx]["tags"] and words[
|
||||
word_idx
|
||||
] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]):
|
||||
# backup option: random left vs. right from pair
|
||||
pair_idx = random.choice([0, 1])
|
||||
# best option: rely on paired POS tags like `` / ''
|
||||
|
@ -64,7 +67,9 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
|
|||
for single_variants in ndsv:
|
||||
variants.extend(single_variants["variants"])
|
||||
for paired_variants in ndpv:
|
||||
variants.extend(list(itertools.chain.from_iterable(paired_variants["variants"])))
|
||||
variants.extend(
|
||||
list(itertools.chain.from_iterable(paired_variants["variants"]))
|
||||
)
|
||||
# store variants in reverse length order to be able to prioritize
|
||||
# longer matches (e.g., "---" before "--")
|
||||
variants = sorted(variants, key=lambda x: len(x))
|
||||
|
@ -88,8 +93,7 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
|
|||
# add variant word
|
||||
else:
|
||||
for variant in variants:
|
||||
if not match_found and \
|
||||
raw[raw_idx:].startswith(variant):
|
||||
if not match_found and raw[raw_idx:].startswith(variant):
|
||||
raw_idx += len(variant)
|
||||
variant_raw += word
|
||||
match_found = True
|
||||
|
|
|
@ -1,17 +1,21 @@
|
|||
from .annotation import TokenAnnotation, DocAnnotation
|
||||
from ..errors import Errors, AlignmentError
|
||||
from ..tokens import Doc
|
||||
|
||||
# We're hoping to kill this GoldParse dependency but for now match semantics.
|
||||
from ..syntax.gold_parse import GoldParse
|
||||
|
||||
|
||||
class Example:
|
||||
def __init__(self, doc_annotation=None, token_annotation=None, doc=None,
|
||||
goldparse=None):
|
||||
def __init__(
|
||||
self, doc_annotation=None, token_annotation=None, doc=None, goldparse=None
|
||||
):
|
||||
""" Doc can either be text, or an actual Doc """
|
||||
self.doc = doc
|
||||
self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation()
|
||||
self.token_annotation = token_annotation if token_annotation else TokenAnnotation()
|
||||
self.token_annotation = (
|
||||
token_annotation if token_annotation else TokenAnnotation()
|
||||
)
|
||||
self.goldparse = goldparse
|
||||
|
||||
@classmethod
|
||||
|
@ -49,13 +53,33 @@ class Example:
|
|||
self.goldparse = gold
|
||||
return self.goldparse
|
||||
|
||||
def set_token_annotation(self, ids=None, words=None, tags=None, pos=None,
|
||||
morphs=None, lemmas=None, heads=None, deps=None,
|
||||
entities=None, sent_starts=None, brackets=None):
|
||||
self.token_annotation = TokenAnnotation(ids=ids, words=words, tags=tags,
|
||||
pos=pos, morphs=morphs, lemmas=lemmas, heads=heads,
|
||||
deps=deps, entities=entities,
|
||||
sent_starts=sent_starts, brackets=brackets)
|
||||
def set_token_annotation(
|
||||
self,
|
||||
ids=None,
|
||||
words=None,
|
||||
tags=None,
|
||||
pos=None,
|
||||
morphs=None,
|
||||
lemmas=None,
|
||||
heads=None,
|
||||
deps=None,
|
||||
entities=None,
|
||||
sent_starts=None,
|
||||
brackets=None,
|
||||
):
|
||||
self.token_annotation = TokenAnnotation(
|
||||
ids=ids,
|
||||
words=words,
|
||||
tags=tags,
|
||||
pos=pos,
|
||||
morphs=morphs,
|
||||
lemmas=lemmas,
|
||||
heads=heads,
|
||||
deps=deps,
|
||||
entities=entities,
|
||||
sent_starts=sent_starts,
|
||||
brackets=brackets,
|
||||
)
|
||||
|
||||
def set_doc_annotation(self, cats=None, links=None):
|
||||
if cats:
|
||||
|
@ -77,11 +101,19 @@ class Example:
|
|||
split_examples = []
|
||||
for i in range(len(t.words)):
|
||||
if i > 0 and t.sent_starts[i] == 1:
|
||||
s_example.set_token_annotation(ids=s_ids,
|
||||
words=s_words, tags=s_tags, pos=s_pos, morphs=s_morphs,
|
||||
lemmas=s_lemmas, heads=s_heads, deps=s_deps,
|
||||
entities=s_ents, sent_starts=s_sent_starts,
|
||||
brackets=s_brackets)
|
||||
s_example.set_token_annotation(
|
||||
ids=s_ids,
|
||||
words=s_words,
|
||||
tags=s_tags,
|
||||
pos=s_pos,
|
||||
morphs=s_morphs,
|
||||
lemmas=s_lemmas,
|
||||
heads=s_heads,
|
||||
deps=s_deps,
|
||||
entities=s_ents,
|
||||
sent_starts=s_sent_starts,
|
||||
brackets=s_brackets,
|
||||
)
|
||||
split_examples.append(s_example)
|
||||
s_example = Example(doc=None, doc_annotation=self.doc_annotation)
|
||||
s_ids, s_words, s_tags, s_pos, s_heads = [], [], [], [], []
|
||||
|
@ -99,20 +131,27 @@ class Example:
|
|||
s_ents.append(t.get_entity(i))
|
||||
s_sent_starts.append(t.get_sent_start(i))
|
||||
for b_end, b_label in t.brackets_by_start.get(i, []):
|
||||
s_brackets.append(
|
||||
(i - sent_start_i, b_end - sent_start_i, b_label)
|
||||
)
|
||||
s_brackets.append((i - sent_start_i, b_end - sent_start_i, b_label))
|
||||
i += 1
|
||||
s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags,
|
||||
pos=s_pos, morphs=s_morphs, lemmas=s_lemmas, heads=s_heads,
|
||||
deps=s_deps, entities=s_ents, sent_starts=s_sent_starts,
|
||||
brackets=s_brackets)
|
||||
s_example.set_token_annotation(
|
||||
ids=s_ids,
|
||||
words=s_words,
|
||||
tags=s_tags,
|
||||
pos=s_pos,
|
||||
morphs=s_morphs,
|
||||
lemmas=s_lemmas,
|
||||
heads=s_heads,
|
||||
deps=s_deps,
|
||||
entities=s_ents,
|
||||
sent_starts=s_sent_starts,
|
||||
brackets=s_brackets,
|
||||
)
|
||||
split_examples.append(s_example)
|
||||
return split_examples
|
||||
|
||||
|
||||
def get_gold_parses(self, merge=True, vocab=None, make_projective=False,
|
||||
ignore_misaligned=False):
|
||||
def get_gold_parses(
|
||||
self, merge=True, vocab=None, make_projective=False, ignore_misaligned=False
|
||||
):
|
||||
"""Return a list of (doc, GoldParse) objects.
|
||||
If merge is set to True, keep all Token annotations as one big list."""
|
||||
d = self.doc_annotation
|
||||
|
@ -125,8 +164,9 @@ class Example:
|
|||
raise ValueError(Errors.E998)
|
||||
doc = Doc(vocab, words=t.words)
|
||||
try:
|
||||
gp = GoldParse.from_annotation(doc, d, t,
|
||||
make_projective=make_projective)
|
||||
gp = GoldParse.from_annotation(
|
||||
doc, d, t, make_projective=make_projective
|
||||
)
|
||||
except AlignmentError:
|
||||
if ignore_misaligned:
|
||||
gp = None
|
||||
|
@ -143,9 +183,12 @@ class Example:
|
|||
raise ValueError(Errors.E998)
|
||||
split_doc = Doc(vocab, words=split_example.token_annotation.words)
|
||||
try:
|
||||
gp = GoldParse.from_annotation(split_doc, d,
|
||||
gp = GoldParse.from_annotation(
|
||||
split_doc,
|
||||
d,
|
||||
split_example.token_annotation,
|
||||
make_projective=make_projective)
|
||||
make_projective=make_projective,
|
||||
)
|
||||
except AlignmentError:
|
||||
if ignore_misaligned:
|
||||
gp = None
|
||||
|
@ -194,7 +237,9 @@ class Example:
|
|||
else:
|
||||
gold = None
|
||||
if gold is not None:
|
||||
converted_examples.append(Example.from_gold(goldparse=gold, doc=doc))
|
||||
converted_examples.append(
|
||||
Example.from_gold(goldparse=gold, doc=doc)
|
||||
)
|
||||
else:
|
||||
raise ValueError(Errors.E999.format(gold_dict=gold_dict))
|
||||
else:
|
||||
|
|
Loading…
Reference in New Issue
Block a user