mirror of https://github.com/explosion/spaCy.git
synced 2025-02-06 06:30:35 +03:00

Commit: 17533a9286 ("Format")
Parent: 0f9b4bbfea

This commit only reformats the gold-data helper code shown below (make_orth_variants and the Example class): backslash-continued conditions become parenthesized multi-line conditions, and long signatures and calls are broken onto one argument per line, in the style produced by an auto-formatter such as Black. The hunks change layout only, not behaviour.
@@ -32,15 +32,18 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
         punct_choices = [random.choice(x["variants"]) for x in ndsv]
         for word_idx in range(len(words)):
             for punct_idx in range(len(ndsv)):
-                if tags[word_idx] in ndsv[punct_idx]["tags"] \
-                        and words[word_idx] in ndsv[punct_idx]["variants"]:
+                if (
+                    tags[word_idx] in ndsv[punct_idx]["tags"]
+                    and words[word_idx] in ndsv[punct_idx]["variants"]
+                ):
                     words[word_idx] = punct_choices[punct_idx]
         # paired variants
         punct_choices = [random.choice(x["variants"]) for x in ndpv]
         for word_idx in range(len(words)):
             for punct_idx in range(len(ndpv)):
-                if tags[word_idx] in ndpv[punct_idx]["tags"] \
-                        and words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]):
+                if tags[word_idx] in ndpv[punct_idx]["tags"] and words[
+                    word_idx
+                ] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]):
                     # backup option: random left vs. right from pair
                     pair_idx = random.choice([0, 1])
                     # best option: rely on paired POS tags like `` / ''
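Aside (not part of the diff): the loop above swaps punctuation tokens for a randomly chosen orthographic variant whenever both the token's tag and its surface form belong to a variant group. A minimal, self-contained sketch of the single-variant logic, using a made-up ndsv entry rather than a real language's orth-variant defaults:

    import random

    # Hypothetical single-variant group; real entries come from the language defaults.
    ndsv = [{"tags": ["``", "''"], "variants": ["'", '"']}]

    words = ["He", "said", '"', "hi", '"']
    tags = ["PRP", "VBD", "``", "UH", "''"]

    # Draw one replacement per variant group, then apply it to every matching
    # token, using the same condition as the reformatted hunk above.
    punct_choices = [random.choice(x["variants"]) for x in ndsv]
    for word_idx in range(len(words)):
        for punct_idx in range(len(ndsv)):
            if (
                tags[word_idx] in ndsv[punct_idx]["tags"]
                and words[word_idx] in ndsv[punct_idx]["variants"]
            ):
                words[word_idx] = punct_choices[punct_idx]

    print(words)  # both quote tokens replaced consistently, e.g. ['He', 'said', "'", 'hi', "'"]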
@@ -64,7 +67,9 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
         for single_variants in ndsv:
             variants.extend(single_variants["variants"])
         for paired_variants in ndpv:
-            variants.extend(list(itertools.chain.from_iterable(paired_variants["variants"])))
+            variants.extend(
+                list(itertools.chain.from_iterable(paired_variants["variants"]))
+            )
         # store variants in reverse length order to be able to prioritize
         # longer matches (e.g., "---" before "--")
         variants = sorted(variants, key=lambda x: len(x))
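Aside (not part of the diff): the call broken over three lines here flattens the paired variant groups (lists of [left, right] pairs) into one candidate list, which is then ordered by length so that longer candidates can be matched before their shorter prefixes. A small sketch with invented data:

    import itertools

    # Invented paired variant group: each entry holds [left, right] pairs.
    ndpv = [{"tags": ["``", "''"], "variants": [["'", "'"], ['"', '"']]}]

    variants = []
    for paired_variants in ndpv:
        variants.extend(
            list(itertools.chain.from_iterable(paired_variants["variants"]))
        )

    print(variants)  # ["'", "'", '"', '"'] (the pairs flattened into one list)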
@@ -88,8 +93,7 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
             # add variant word
             else:
                 for variant in variants:
-                    if not match_found and \
-                            raw[raw_idx:].startswith(variant):
+                    if not match_found and raw[raw_idx:].startswith(variant):
                         raw_idx += len(variant)
                         variant_raw += word
                         match_found = True
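Aside (not part of the diff): the condition joined onto one line here performs a prefix match against the raw text, which is why the candidates are kept ordered by length ("---" must be tried before "--"). A tiny self-contained sketch with hypothetical variants:

    raw = "--- a dash"
    word = "-"                      # the token the annotation expects at this position
    variants = ["---", "--", "-"]   # longest first, so "---" is tried before "--"

    raw_idx = 0
    variant_raw = ""
    match_found = False
    for variant in variants:
        if not match_found and raw[raw_idx:].startswith(variant):
            raw_idx += len(variant)
            variant_raw += word
            match_found = True

    print(variant_raw, raw_idx)  # '-' 3  (all of "---" was consumed, not just "--")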
@@ -1,17 +1,21 @@
 from .annotation import TokenAnnotation, DocAnnotation
 from ..errors import Errors, AlignmentError
 from ..tokens import Doc

 # We're hoping to kill this GoldParse dependency but for now match semantics.
 from ..syntax.gold_parse import GoldParse


 class Example:
-    def __init__(self, doc_annotation=None, token_annotation=None, doc=None,
-                 goldparse=None):
+    def __init__(
+        self, doc_annotation=None, token_annotation=None, doc=None, goldparse=None
+    ):
         """ Doc can either be text, or an actual Doc """
         self.doc = doc
         self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation()
-        self.token_annotation = token_annotation if token_annotation else TokenAnnotation()
+        self.token_annotation = (
+            token_annotation if token_annotation else TokenAnnotation()
+        )
         self.goldparse = goldparse

     @classmethod
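Aside (not part of the diff): the constructor falls back to empty annotation containers when none are supplied. A self-contained sketch of that pattern with stand-in classes (the real TokenAnnotation and DocAnnotation come from .annotation, as imported above):

    class TokenAnnotation:
        pass

    class DocAnnotation:
        pass

    class Example:
        def __init__(
            self, doc_annotation=None, token_annotation=None, doc=None, goldparse=None
        ):
            self.doc = doc
            self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation()
            self.token_annotation = (
                token_annotation if token_annotation else TokenAnnotation()
            )
            self.goldparse = goldparse

    ex = Example()
    assert isinstance(ex.token_annotation, TokenAnnotation)  # defaults filled in
    assert isinstance(ex.doc_annotation, DocAnnotation)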
@@ -49,13 +53,33 @@ class Example:
         self.goldparse = gold
         return self.goldparse

-    def set_token_annotation(self, ids=None, words=None, tags=None, pos=None,
-                             morphs=None, lemmas=None, heads=None, deps=None,
-                             entities=None, sent_starts=None, brackets=None):
-        self.token_annotation = TokenAnnotation(ids=ids, words=words, tags=tags,
-                                                pos=pos, morphs=morphs, lemmas=lemmas, heads=heads,
-                                                deps=deps, entities=entities,
-                                                sent_starts=sent_starts, brackets=brackets)
+    def set_token_annotation(
+        self,
+        ids=None,
+        words=None,
+        tags=None,
+        pos=None,
+        morphs=None,
+        lemmas=None,
+        heads=None,
+        deps=None,
+        entities=None,
+        sent_starts=None,
+        brackets=None,
+    ):
+        self.token_annotation = TokenAnnotation(
+            ids=ids,
+            words=words,
+            tags=tags,
+            pos=pos,
+            morphs=morphs,
+            lemmas=lemmas,
+            heads=heads,
+            deps=deps,
+            entities=entities,
+            sent_starts=sent_starts,
+            brackets=brackets,
+        )

     def set_doc_annotation(self, cats=None, links=None):
         if cats:
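Aside (not part of the diff): with one keyword per line, the expanded signature reads the same way it is usually called. A sketch of such a call with a stand-in TokenAnnotation that simply stores whatever it is given (the real class lives in .annotation):

    class TokenAnnotation:
        def __init__(self, **fields):
            self.__dict__.update(fields)

    # Only a few of the keyword arguments from the signature above are used here.
    t = TokenAnnotation(
        ids=[0, 1],
        words=["Hello", "world"],
        tags=["UH", "NN"],
        heads=[0, 0],
        deps=["ROOT", "npadvmod"],
    )
    print(t.words, t.deps)  # ['Hello', 'world'] ['ROOT', 'npadvmod']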
@@ -77,11 +101,19 @@ class Example:
         split_examples = []
         for i in range(len(t.words)):
             if i > 0 and t.sent_starts[i] == 1:
-                s_example.set_token_annotation(ids=s_ids,
-                        words=s_words, tags=s_tags, pos=s_pos, morphs=s_morphs,
-                        lemmas=s_lemmas, heads=s_heads, deps=s_deps,
-                        entities=s_ents, sent_starts=s_sent_starts,
-                        brackets=s_brackets)
+                s_example.set_token_annotation(
+                    ids=s_ids,
+                    words=s_words,
+                    tags=s_tags,
+                    pos=s_pos,
+                    morphs=s_morphs,
+                    lemmas=s_lemmas,
+                    heads=s_heads,
+                    deps=s_deps,
+                    entities=s_ents,
+                    sent_starts=s_sent_starts,
+                    brackets=s_brackets,
+                )
                 split_examples.append(s_example)
                 s_example = Example(doc=None, doc_annotation=self.doc_annotation)
                 s_ids, s_words, s_tags, s_pos, s_heads = [], [], [], [], []
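Aside (not part of the diff): the surrounding method splits one multi-sentence annotation into per-sentence Examples by flushing the accumulated per-token lists whenever a new sentence starts. A minimal sketch of that idea with invented data:

    words = ["Hello", ".", "This", "is", "it", "."]
    sent_starts = [1, 0, 1, 0, 0, 0]

    sentences = []
    current = []
    for i in range(len(words)):
        if i > 0 and sent_starts[i] == 1:
            sentences.append(current)   # flush the finished sentence
            current = []
        current.append(words[i])
    sentences.append(current)           # flush the last sentence

    print(sentences)  # [['Hello', '.'], ['This', 'is', 'it', '.']]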
@@ -99,20 +131,27 @@ class Example:
             s_ents.append(t.get_entity(i))
             s_sent_starts.append(t.get_sent_start(i))
             for b_end, b_label in t.brackets_by_start.get(i, []):
-                s_brackets.append(
-                    (i - sent_start_i, b_end - sent_start_i, b_label)
-                )
+                s_brackets.append((i - sent_start_i, b_end - sent_start_i, b_label))
             i += 1
-        s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags,
-                pos=s_pos, morphs=s_morphs, lemmas=s_lemmas, heads=s_heads,
-                deps=s_deps, entities=s_ents, sent_starts=s_sent_starts,
-                brackets=s_brackets)
+        s_example.set_token_annotation(
+            ids=s_ids,
+            words=s_words,
+            tags=s_tags,
+            pos=s_pos,
+            morphs=s_morphs,
+            lemmas=s_lemmas,
+            heads=s_heads,
+            deps=s_deps,
+            entities=s_ents,
+            sent_starts=s_sent_starts,
+            brackets=s_brackets,
+        )
         split_examples.append(s_example)
         return split_examples

-    def get_gold_parses(self, merge=True, vocab=None, make_projective=False,
-                        ignore_misaligned=False):
+    def get_gold_parses(
+        self, merge=True, vocab=None, make_projective=False, ignore_misaligned=False
+    ):
         """Return a list of (doc, GoldParse) objects.
         If merge is set to True, keep all Token annotations as one big list."""
         d = self.doc_annotation
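Aside (not part of the diff): the tuple collapsed onto one line above re-bases bracket (span) offsets so they are relative to the start of the current sentence rather than to the whole document. A small sketch of that arithmetic with made-up indices:

    sent_start_i = 2                       # document index where the sentence begins
    brackets_by_start = {3: [(5, "NP")]}   # start -> [(end, label)] in document coordinates

    s_brackets = []
    for i in range(2, 6):
        for b_end, b_label in brackets_by_start.get(i, []):
            s_brackets.append((i - sent_start_i, b_end - sent_start_i, b_label))

    print(s_brackets)  # [(1, 3, 'NP')], i.e. sentence-relative offsets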
@@ -125,8 +164,9 @@ class Example:
                 raise ValueError(Errors.E998)
             doc = Doc(vocab, words=t.words)
             try:
-                gp = GoldParse.from_annotation(doc, d, t,
-                                               make_projective=make_projective)
+                gp = GoldParse.from_annotation(
+                    doc, d, t, make_projective=make_projective
+                )
             except AlignmentError:
                 if ignore_misaligned:
                     gp = None
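Aside (not part of the diff): the try/except around GoldParse.from_annotation implements the ignore_misaligned option, turning alignment failures into gp = None so the example can be skipped. A self-contained sketch of that pattern with a stand-in AlignmentError (the hunk does not show the non-ignored branch; re-raising there is an assumption of this sketch):

    class AlignmentError(ValueError):
        """Stand-in for spaCy's AlignmentError."""

    def try_gold(make_gold, ignore_misaligned=False):
        try:
            gp = make_gold()
        except AlignmentError:
            if ignore_misaligned:
                gp = None    # skip this example instead of failing the whole run
            else:
                raise        # assumption: non-ignored errors propagate
        return gp

    def misaligned():
        raise AlignmentError("tokens do not line up with the raw text")

    print(try_gold(misaligned, ignore_misaligned=True))  # None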
@@ -143,9 +183,12 @@ class Example:
                 raise ValueError(Errors.E998)
             split_doc = Doc(vocab, words=split_example.token_annotation.words)
             try:
-                gp = GoldParse.from_annotation(split_doc, d,
-                                               split_example.token_annotation,
-                                               make_projective=make_projective)
+                gp = GoldParse.from_annotation(
+                    split_doc,
+                    d,
+                    split_example.token_annotation,
+                    make_projective=make_projective,
+                )
             except AlignmentError:
                 if ignore_misaligned:
                     gp = None
@@ -194,7 +237,9 @@ class Example:
             else:
                 gold = None
             if gold is not None:
-                converted_examples.append(Example.from_gold(goldparse=gold, doc=doc))
+                converted_examples.append(
+                    Example.from_gold(goldparse=gold, doc=doc)
+                )
             else:
                 raise ValueError(Errors.E999.format(gold_dict=gold_dict))
         else:
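Aside (not part of the diff): the wrapped call above converts each (doc, gold) pair into an Example via the from_gold classmethod, and raises when no gold data could be built. A sketch of that conversion with stand-in classes and invented data:

    class Example:
        def __init__(self, doc=None, goldparse=None):
            self.doc = doc
            self.goldparse = goldparse

        @classmethod
        def from_gold(cls, goldparse=None, doc=None):
            return cls(doc=doc, goldparse=goldparse)

    converted_examples = []
    for doc, gold in [("a doc", "a goldparse")]:
        if gold is not None:
            converted_examples.append(Example.from_gold(goldparse=gold, doc=doc))
        else:
            raise ValueError("unsupported gold data")  # stand-in for Errors.E999

    print(converted_examples[0].goldparse)  # 'a goldparse'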