commit 6a75992af6 (parent 455dc0d9e2)
Author: Matthew Honnibal
Date: 2020-06-22 01:11:43 +02:00

48 changed files with 593 additions and 287 deletions

View File

@@ -134,7 +134,7 @@ def verify_cli_args(
     merge_subtokens,
     converter,
     ner_map,
-    lang
+    lang,
 ):
     if converter == "ner" or converter == "iob":
         input_data = input_path.open("r", encoding="utf-8").read()
@@ -148,7 +148,7 @@ def verify_cli_args(
         else:
             msg.warn(
                 "Can't automatically detect NER format. Conversion may not",
-                "succeed. See https://spacy.io/api/cli#convert"
+                "succeed. See https://spacy.io/api/cli#convert",
             )
     if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
         # TODO: support msgpack via stdout in srsly?

View File

@@ -178,7 +178,6 @@ def train_cli(
 )
 
 
-
 def train(
     config_path,
     data_paths,
@@ -238,8 +237,7 @@ def train(
                 tok2vec = tok2vec.get(subpath)
             if not tok2vec:
                 msg.fail(
-                    f"Could not locate the tok2vec model at {tok2vec_path}.",
-                    exits=1,
+                    f"Could not locate the tok2vec model at {tok2vec_path}.", exits=1,
                 )
             tok2vec.from_bytes(weights_data)
@@ -351,7 +349,11 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
         try:
             weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
         except KeyError as e:
-            raise KeyError(Errors.E983.format(dict='score_weights', key=str(e), keys=list(scores.keys())))
+            raise KeyError(
+                Errors.E983.format(
+                    dict="score_weights", key=str(e), keys=list(scores.keys())
+                )
+            )
         scores["speed"] = wps
         return weighted_score, scores
@@ -500,15 +502,23 @@ def setup_printer(training, nlp):
             ]
         except KeyError as e:
             raise KeyError(
-                Errors.E983.format(dict='scores (losses)', key=str(e), keys=list(info["losses"].keys())))
+                Errors.E983.format(
+                    dict="scores (losses)", key=str(e), keys=list(info["losses"].keys())
+                )
+            )
         try:
             scores = [
-                "{0:.2f}".format(float(info["other_scores"][col]))
-                for col in score_cols
+                "{0:.2f}".format(float(info["other_scores"][col])) for col in score_cols
             ]
         except KeyError as e:
-            raise KeyError(Errors.E983.format(dict='scores (other)', key=str(e), keys=list(info["other_scores"].keys())))
+            raise KeyError(
+                Errors.E983.format(
+                    dict="scores (other)",
+                    key=str(e),
+                    keys=list(info["other_scores"].keys()),
+                )
+            )
         data = (
             [info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))]
         )

View File

@@ -5,7 +5,9 @@ import itertools
 def make_orth_variants_example(nlp, example, orth_variant_level=0.0):  # TODO: naming
     raw_text = example.text
     orig_dict = example.to_dict()
-    variant_text, variant_token_annot = make_orth_variants(nlp, raw_text, orig_dict["token_annotation"], orth_variant_level)
+    variant_text, variant_token_annot = make_orth_variants(
+        nlp, raw_text, orig_dict["token_annotation"], orth_variant_level
+    )
     doc = nlp.make_doc(variant_text)
     orig_dict["token_annotation"] = variant_token_annot
     return example.from_dict(doc, orig_dict)

View File

@@ -43,10 +43,7 @@ def conllu2json(
         raw += example.text
         sentences.append(
             generate_sentence(
-                example.to_dict(),
-                has_ner_tags,
-                MISC_NER_PATTERN,
-                ner_map=ner_map,
+                example.to_dict(), has_ner_tags, MISC_NER_PATTERN, ner_map=ner_map,
             )
         )
         # Real-sized documents could be extracted using the comments on the

View File

@@ -8,6 +8,7 @@ from ..example import _fix_legacy_dict_data, _parse_example_dict_data
 from ...util import load_model
 from ...lang.xx import MultiLanguage
 
+
 @contextlib.contextmanager
 def make_tempdir():
     d = Path(tempfile.mkdtemp())
@@ -15,11 +16,7 @@ def make_tempdir():
     shutil.rmtree(str(d))
 
 
-def json2docs(
-    input_data,
-    model=None,
-    **kwargs
-):
+def json2docs(input_data, model=None, **kwargs):
     nlp = load_model(model) if model is not None else MultiLanguage()
     docs = []
     with make_tempdir() as tmp_dir:
@@ -29,10 +26,6 @@ def json2docs(
         for json_annot in read_json_file(json_path):
             example_dict = _fix_legacy_dict_data(json_annot)
             tok_dict, doc_dict = _parse_example_dict_data(example_dict)
-            doc = annotations2doc(
-                nlp.vocab,
-                tok_dict,
-                doc_dict
-            )
+            doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
             docs.append(doc)
     return docs

View File

@@ -12,6 +12,7 @@ class Corpus:
     DOCS: https://spacy.io/api/goldcorpus
     """
+
     def __init__(self, train_loc, dev_loc, limit=0):
         """Create a GoldCorpus.

View File

@@ -54,7 +54,7 @@ def biluo_tags_from_doc(doc, missing="O"):
     return biluo_tags_from_offsets(
         doc,
         [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents],
-        missing=missing
+        missing=missing,
     )

View File

@@ -542,7 +542,6 @@ class Language(object):
                 raise ValueError(Errors.E979.format(type=type(eg)))
         return converted_examples
 
-
     def update(
         self,
         examples,
@@ -822,7 +821,7 @@ class Language(object):
                 batch_size=batch_size,
                 disable=disable,
                 n_process=n_process,
-                component_cfg=component_cfg
+                component_cfg=component_cfg,
             )
             for doc, context in zip(docs, contexts):
                 yield (doc, context)

View File

@@ -51,7 +51,13 @@ class Lemmatizer(object):
         index_table = self.lookups.get_table("lemma_index", {})
         exc_table = self.lookups.get_table("lemma_exc", {})
         rules_table = self.lookups.get_table("lemma_rules", {})
-        if not any((index_table.get(univ_pos), exc_table.get(univ_pos), rules_table.get(univ_pos))):
+        if not any(
+            (
+                index_table.get(univ_pos),
+                exc_table.get(univ_pos),
+                rules_table.get(univ_pos),
+            )
+        ):
             if univ_pos == "propn":
                 return [string]
             else:

View File

@@ -14,7 +14,7 @@ def BILUO() -> Model[Padded, Padded]:
         forward,
         init=init,
         dims={"nO": None},
-        attrs={"get_num_actions": get_num_actions}
+        attrs={"get_num_actions": get_num_actions},
     )

View File

@@ -12,7 +12,7 @@ def IOB() -> Model[Padded, Padded]:
         forward,
         init=init,
         dims={"nO": None},
-        attrs={"get_num_actions": get_num_actions}
+        attrs={"get_num_actions": get_num_actions},
     )

View File

@@ -7,7 +7,12 @@ def build_multi_task_model(tok2vec, maxout_pieces, token_vector_width, nO=None):
     softmax = Softmax(nO=nO, nI=token_vector_width * 2)
     model = chain(
         tok2vec,
-        Maxout(nO=token_vector_width * 2, nI=token_vector_width, nP=maxout_pieces, dropout=0.0),
+        Maxout(
+            nO=token_vector_width * 2,
+            nI=token_vector_width,
+            nP=maxout_pieces,
+            dropout=0.0,
+        ),
         LayerNorm(token_vector_width * 2),
         softmax,
     )
@@ -20,7 +25,11 @@ def build_cloze_multi_task_model(vocab, tok2vec, maxout_pieces, nO=None):
     # nO = vocab.vectors.data.shape[1]
     output_layer = chain(
         Maxout(
-            nO=nO, nI=tok2vec.get_dim("nO"), nP=maxout_pieces, normalize=True, dropout=0.0
+            nO=nO,
+            nI=tok2vec.get_dim("nO"),
+            nP=maxout_pieces,
+            normalize=True,
+            dropout=0.0,
         ),
         Linear(nO=nO, nI=nO, init_W=zero_init),
     )
@@ -39,7 +48,9 @@ def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15):
     def mlm_forward(model, docs, is_train):
         mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob)
         mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
-        output, backprop = model.get_ref("wrapped-model").begin_update(docs)  # drop=drop
+        output, backprop = model.get_ref("wrapped-model").begin_update(
+            docs
+        )  # drop=drop
 
         def mlm_backward(d_output):
             d_output *= 1 - mask

View File

@@ -16,18 +16,14 @@ def build_tb_parser_model(
     nO=None,
 ):
     t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
-    tok2vec = chain(
-        tok2vec,
-        with_array(Linear(hidden_width, t2v_width)),
-        list2array(),
-    )
+    tok2vec = chain(tok2vec, with_array(Linear(hidden_width, t2v_width)), list2array(),)
     tok2vec.set_dim("nO", hidden_width)
     lower = PrecomputableAffine(
         nO=hidden_width if use_upper else nO,
         nF=nr_feature_tokens,
         nI=tok2vec.get_dim("nO"),
-        nP=maxout_pieces
+        nP=maxout_pieces,
     )
     if use_upper:
         with use_ops("numpy"):

View File

@@ -1,6 +1,14 @@
 import functools
 from typing import List, Tuple, Dict, Optional
-from thinc.api import Ops, Model, Linear, Softmax, with_array, softmax_activation, padded2list
+from thinc.api import (
+    Ops,
+    Model,
+    Linear,
+    Softmax,
+    with_array,
+    softmax_activation,
+    padded2list,
+)
 from thinc.api import chain, list2padded, configure_normal_init
 from thinc.api import Dropout
 from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d
@@ -12,12 +20,12 @@ from ...util import registry
 @registry.architectures.register("spacy.BiluoTagger.v1")
-def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], List[Floats2d]]:
+def BiluoTagger(
+    tok2vec: Model[List[Doc], List[Floats2d]]
+) -> Model[List[Doc], List[Floats2d]]:
     biluo = BILUO()
     linear = Linear(
-        nO=None,
-        nI=tok2vec.get_dim("nO"),
-        init_W=configure_normal_init(mean=0.02)
+        nO=None, nI=tok2vec.get_dim("nO"), init_W=configure_normal_init(mean=0.02)
     )
     model = chain(
         tok2vec,
@@ -25,7 +33,7 @@ def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], L
         with_array(chain(Dropout(0.1), linear)),
         biluo,
         with_array(softmax_activation()),
-        padded2list()
+        padded2list(),
     )
 
     return Model(
@@ -35,11 +43,14 @@ def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], L
         layers=[model, linear],
         refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo},
         dims={"nO": None},
-        attrs={"get_num_actions": biluo.attrs["get_num_actions"]}
+        attrs={"get_num_actions": biluo.attrs["get_num_actions"]},
     )
 
+
 @registry.architectures.register("spacy.IOBTagger.v1")
-def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], List[Floats2d]]:
+def IOBTagger(
+    tok2vec: Model[List[Doc], List[Floats2d]]
+) -> Model[List[Doc], List[Floats2d]]:
     biluo = IOB()
     linear = Linear(nO=None, nI=tok2vec.get_dim("nO"))
     model = chain(
@@ -48,7 +59,7 @@ def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], Lis
         with_array(linear),
         biluo,
         with_array(softmax_activation()),
-        padded2list()
+        padded2list(),
     )
 
     return Model(
@@ -58,11 +69,10 @@ def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], Lis
         layers=[model],
         refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo},
         dims={"nO": None},
-        attrs={"get_num_actions": biluo.attrs["get_num_actions"]}
+        attrs={"get_num_actions": biluo.attrs["get_num_actions"]},
     )
 
-
 def init(model: Model[List[Doc], List[Floats2d]], X=None, Y=None) -> None:
     if model.get_dim("nO") is None and Y:
         model.set_dim("nO", Y[0].shape[1])

View File

@@ -1,7 +1,30 @@
-from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic, ParametricAttention
+from thinc.api import (
+    Model,
+    reduce_mean,
+    Linear,
+    list2ragged,
+    Logistic,
+    ParametricAttention,
+)
 from thinc.api import chain, concatenate, clone, Dropout
-from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum, Relu, residual, expand_window
-from thinc.api import HashEmbed, with_ragged, with_array, with_cpu, uniqued, FeatureExtractor
+from thinc.api import (
+    SparseLinear,
+    Softmax,
+    softmax_activation,
+    Maxout,
+    reduce_sum,
+    Relu,
+    residual,
+    expand_window,
+)
+from thinc.api import (
+    HashEmbed,
+    with_ragged,
+    with_array,
+    with_cpu,
+    uniqued,
+    FeatureExtractor,
+)
 from ..spacy_vectors import SpacyVectors
 from ... import util
@@ -50,14 +73,31 @@ def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO
 @registry.architectures.register("spacy.TextCat.v1")
-def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_classes, ngram_size,
-                          window_size, conv_depth, dropout, nO=None):
+def build_text_classifier(
+    width,
+    embed_size,
+    pretrained_vectors,
+    exclusive_classes,
+    ngram_size,
+    window_size,
+    conv_depth,
+    dropout,
+    nO=None,
+):
     cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
     with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
-        lower = HashEmbed(nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout)
-        prefix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout)
-        suffix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout)
-        shape = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout)
+        lower = HashEmbed(
+            nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout
+        )
+        prefix = HashEmbed(
+            nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout
+        )
+        suffix = HashEmbed(
+            nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout
+        )
+        shape = HashEmbed(
+            nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout
+        )
 
         width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
         trained_vectors = FeatureExtractor(cols) >> with_array(
@@ -83,8 +123,15 @@ def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_class
             vectors_width = width
         tok2vec = vector_layer >> with_array(
             Maxout(width, vectors_width, normalize=True)
-            >> residual((expand_window(window_size=window_size)
-            >> Maxout(nO=width, nI=width * ((window_size * 2) + 1), normalize=True))) ** conv_depth,
+            >> residual(
+                (
+                    expand_window(window_size=window_size)
+                    >> Maxout(
+                        nO=width, nI=width * ((window_size * 2) + 1), normalize=True
+                    )
+                )
+            )
+            ** conv_depth,
             pad=conv_depth,
         )
         cnn_model = (
@@ -98,15 +145,16 @@ def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_class
         )
 
         linear_model = build_bow_text_classifier(
-            nO=nO, ngram_size=ngram_size, exclusive_classes=exclusive_classes, no_output_layer=False
+            nO=nO,
+            ngram_size=ngram_size,
+            exclusive_classes=exclusive_classes,
+            no_output_layer=False,
         )
         nO_double = nO * 2 if nO else None
         if exclusive_classes:
             output_layer = Softmax(nO=nO, nI=nO_double)
         else:
-            output_layer = (
-                Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic()
-            )
+            output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic()
         model = (linear_model | cnn_model) >> output_layer
         model.set_ref("tok2vec", tok2vec)
         if model.has_dim("nO") is not False:

View File

@@ -99,7 +99,13 @@ def hash_charembed_cnn(
 @registry.architectures.register("spacy.HashEmbedBiLSTM.v1")
 def hash_embed_bilstm_v1(
-    pretrained_vectors, width, depth, embed_size, subword_features, maxout_pieces, dropout
+    pretrained_vectors,
+    width,
+    depth,
+    embed_size,
+    subword_features,
+    maxout_pieces,
+    dropout,
 ):
     # Does not use character embeddings: set to False by default
     return build_Tok2Vec_model(
@@ -141,21 +147,24 @@ def hash_char_embed_bilstm_v1(
 @registry.architectures.register("spacy.LayerNormalizedMaxout.v1")
 def LayerNormalizedMaxout(width, maxout_pieces):
-    return Maxout(
-        nO=width,
-        nP=maxout_pieces,
-        dropout=0.0,
-        normalize=True,
-    )
+    return Maxout(nO=width, nP=maxout_pieces, dropout=0.0, normalize=True,)
 
 
 @registry.architectures.register("spacy.MultiHashEmbed.v1")
-def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix, dropout):
+def MultiHashEmbed(
+    columns, width, rows, use_subwords, pretrained_vectors, mix, dropout
+):
     norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout)
     if use_subwords:
-        prefix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout)
-        suffix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout)
-        shape = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout)
+        prefix = HashEmbed(
+            nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout
+        )
+        suffix = HashEmbed(
+            nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout
+        )
+        shape = HashEmbed(
+            nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout
+        )
 
     if pretrained_vectors:
         glove = StaticVectors(
@@ -195,7 +204,13 @@ def CharacterEmbed(columns, width, rows, nM, nC, features, dropout):
 def MaxoutWindowEncoder(width, window_size, maxout_pieces, depth):
     cnn = chain(
         expand_window(window_size=window_size),
-        Maxout(nO=width, nI=width * ((window_size * 2) + 1), nP=maxout_pieces, dropout=0.0, normalize=True),
+        Maxout(
+            nO=width,
+            nI=width * ((window_size * 2) + 1),
+            nP=maxout_pieces,
+            dropout=0.0,
+            normalize=True,
+        ),
     )
     model = clone(residual(cnn), depth)
     model.set_dim("nO", width)
@@ -247,11 +262,19 @@ def build_Tok2Vec_model(
         subword_features = False
     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
     with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
-        norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout)
+        norm = HashEmbed(
+            nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout
+        )
         if subword_features:
-            prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout)
-            suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout)
-            shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout)
+            prefix = HashEmbed(
+                nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout
+            )
+            suffix = HashEmbed(
+                nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout
+            )
+            shape = HashEmbed(
+                nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout
+            )
        else:
            prefix, suffix, shape = (None, None, None)
        if pretrained_vectors is not None:

View File

@@ -20,8 +20,8 @@ def TransitionModel(tok2vec, lower, upper, unseen_classes=set()):
         attrs={
             "has_upper": has_upper,
             "unseen_classes": set(unseen_classes),
-            "resize_output": resize_output
-        }
+            "resize_output": resize_output,
+        },
     )
@@ -31,7 +31,7 @@ def forward(model, X, is_train):
         model.layers,
         unseen_classes=model.attrs["unseen_classes"],
         train=is_train,
-        has_upper=model.attrs["has_upper"]
+        has_upper=model.attrs["has_upper"],
     )
     return step_model, step_model.finish_steps
@@ -62,7 +62,7 @@ def resize_output(model, new_nO):
     nI = None
     if smaller.has_dim("nI"):
         nI = smaller.get_dim("nI")
-    with use_ops('numpy'):
+    with use_ops("numpy"):
         larger = Linear(nO=new_nO, nI=nI)
         larger.init = smaller.init
         # it could be that the model is not initialized yet, then skip this bit

View File

@@ -21,9 +21,7 @@ class SimpleNER(Pipe):
         self.model = model
         self.cfg = {"labels": []}
         self.loss_func = SequenceCategoricalCrossentropy(
-            names=self.get_tag_names(),
-            normalize=True,
-            missing_value=None
+            names=self.get_tag_names(), normalize=True, missing_value=None
         )
         assert self.model is not None
@@ -42,17 +40,17 @@ class SimpleNER(Pipe):
     def get_tag_names(self):
         if self.is_biluo:
             return (
-                [f"B-{label}" for label in self.labels] +
-                [f"I-{label}" for label in self.labels] +
-                [f"L-{label}" for label in self.labels] +
-                [f"U-{label}" for label in self.labels] +
-                ["O"]
+                [f"B-{label}" for label in self.labels]
+                + [f"I-{label}" for label in self.labels]
+                + [f"L-{label}" for label in self.labels]
+                + [f"U-{label}" for label in self.labels]
+                + ["O"]
             )
         else:
             return (
-                [f"B-{label}" for label in self.labels] +
-                [f"I-{label}" for label in self.labels] +
-                ["O"]
+                [f"B-{label}" for label in self.labels]
+                + [f"I-{label}" for label in self.labels]
+                + ["O"]
             )
 
     def predict(self, docs: List[Doc]) -> List[Floats2d]:
@@ -107,7 +105,7 @@ class SimpleNER(Pipe):
     def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
         self.cfg.update(kwargs)
-        if not hasattr(get_examples, '__call__'):
+        if not hasattr(get_examples, "__call__"):
             gold_tuples = get_examples
             get_examples = lambda: gold_tuples
         labels = _get_labels(get_examples())
@@ -121,9 +119,7 @@ class SimpleNER(Pipe):
         self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
         link_vectors_to_models(self.vocab)
         self.loss_func = SequenceCategoricalCrossentropy(
-            names=self.get_tag_names(),
-            normalize=True,
-            missing_value=None
+            names=self.get_tag_names(), normalize=True, missing_value=None
         )
         return sgd
@@ -144,6 +140,6 @@ def _get_labels(examples):
     labels = set()
     for eg in examples:
         for ner_tag in eg.get_aligned("ENT_TYPE", as_string=True):
-            if ner_tag != 'O' and ner_tag != '-':
+            if ner_tag != "O" and ner_tag != "-":
                 labels.add(ner_tag)
     return list(sorted(labels))

View File

@@ -97,7 +97,9 @@ class Scorer(object):
             for name, component in pipeline:
                 if name == "textcat":
                     self.textcat_multilabel = component.model.attrs["multi_label"]
-                    self.textcat_positive_label = component.cfg.get("positive_label", None)
+                    self.textcat_positive_label = component.cfg.get(
+                        "positive_label", None
+                    )
                     for label in component.cfg.get("labels", []):
                         self.textcat_auc_per_cat[label] = ROCAUCScore()
                         self.textcat_f_per_cat[label] = PRFScore()
@@ -359,7 +361,9 @@ class Scorer(object):
                         (gold_i, gold_head, token.dep_.lower())
                     )
        # Find all NER labels in gold and doc
-        ent_labels = set([k.label_ for k in gold_doc.ents] + [k.label_ for k in doc.ents])
+        ent_labels = set(
+            [k.label_ for k in gold_doc.ents] + [k.label_ for k in doc.ents]
+        )
        # Set up all labels for per type scoring and prepare gold per type
        gold_per_ents = {ent_label: set() for ent_label in ent_labels}
        for ent_label in ent_labels:
@@ -392,7 +396,10 @@ class Scorer(object):
        self.pos.score_set(cand_pos, gold_pos)
        self.morphs.score_set(cand_morphs, gold_morphs)
        for field in self.morphs_per_feat:
-            self.morphs_per_feat[field].score_set(cand_morphs_per_feat.get(field, set()), gold_morphs_per_feat.get(field, set()))
+            self.morphs_per_feat[field].score_set(
+                cand_morphs_per_feat.get(field, set()),
+                gold_morphs_per_feat.get(field, set()),
+            )
        self.sent_starts.score_set(cand_sent_starts, gold_sent_starts)
        self.labelled.score_set(cand_deps, gold_deps)
        for dep in self.labelled_per_dep:
@@ -404,7 +411,9 @@ class Scorer(object):
            )
        if (
            len(gold_doc.cats) > 0
-            and set(self.textcat_f_per_cat) == set(self.textcat_auc_per_cat) == set(gold_doc.cats)
+            and set(self.textcat_f_per_cat)
+            == set(self.textcat_auc_per_cat)
+            == set(gold_doc.cats)
            and set(gold_doc.cats) == set(doc.cats)
        ):
            goldcat = max(gold_doc.cats, key=gold_doc.cats.get)

View File

@@ -9,7 +9,12 @@ from spacy.pipeline.defaults import default_ner
 def test_doc_add_entities_set_ents_iob(en_vocab):
     text = ["This", "is", "a", "lion"]
     doc = get_doc(en_vocab, text)
-    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 30,
+        "beam_width": 1,
+        "beam_update_prob": 1.0,
+    }
     ner = EntityRecognizer(en_vocab, default_ner(), **config)
     ner.begin_training([])
     ner(doc)
@@ -26,7 +31,12 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
 def test_ents_reset(en_vocab):
     text = ["This", "is", "a", "lion"]
     doc = get_doc(en_vocab, text)
-    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 30,
+        "beam_width": 1,
+        "beam_update_prob": 1.0,
+    }
     ner = EntityRecognizer(en_vocab, default_ner(), **config)
     ner.begin_training([])
     ner(doc)

View File

@@ -17,7 +17,12 @@ def vocab():
 @pytest.fixture
 def parser(vocab):
-    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 30,
+        "beam_width": 1,
+        "beam_update_prob": 1.0,
+    }
     parser = DependencyParser(vocab, default_parser(), **config)
     return parser
@@ -35,10 +40,7 @@ def _train_parser(parser):
     for i in range(5):
         losses = {}
         doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
-        gold = {
-            "heads": [1, 1, 3, 3],
-            "deps": ["left", "ROOT", "left", "ROOT"]
-        }
+        gold = {"heads": [1, 1, 3, 3], "deps": ["left", "ROOT", "left", "ROOT"]}
         example = Example.from_dict(doc, gold)
         parser.update([example], sgd=sgd, losses=losses)
     return parser
@@ -51,10 +53,7 @@ def test_add_label(parser):
     for i in range(100):
         losses = {}
         doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
-        gold = {
-            "heads": [1, 1, 3, 3],
-            "deps": ["right", "ROOT", "left", "ROOT"]
-        }
+        gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]}
         parser.update((doc, gold), sgd=sgd, losses=losses)
     doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
     doc = parser(doc)
@@ -63,7 +62,12 @@ def test_add_label(parser):
 def test_add_label_deserializes_correctly():
-    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 30,
+        "beam_width": 1,
+        "beam_update_prob": 1.0,
+    }
     ner1 = EntityRecognizer(Vocab(), default_ner(), **config)
     ner1.add_label("C")
     ner1.add_label("B")
@@ -78,6 +82,7 @@ def test_add_label_deserializes_correctly():
     for i in range(ner1.moves.n_moves):
         assert ner1.moves.get_class_name(i) == ner2.moves.get_class_name(i)
 
+
 @pytest.mark.parametrize(
     "pipe_cls,n_moves,model",
     [(DependencyParser, 5, default_parser()), (EntityRecognizer, 4, default_ner())],

View File

@@ -139,7 +139,12 @@ def test_get_oracle_actions():
         deps.append(dep)
         ents.append(ent)
     doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
-    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 30,
+        "beam_width": 1,
+        "beam_update_prob": 1.0,
+    }
     parser = DependencyParser(doc.vocab, default_parser(), **config)
     parser.moves.add_action(0, "")
     parser.moves.add_action(1, "")
@@ -151,7 +156,9 @@ def test_get_oracle_actions():
             parser.moves.add_action(2, dep)
         elif head < i:
             parser.moves.add_action(3, dep)
-    example = Example.from_dict(doc, {"words": words, "tags": tags, "heads": heads, "deps": deps})
+    example = Example.from_dict(
+        doc, {"words": words, "tags": tags, "heads": heads, "deps": deps}
+    )
     parser.moves.get_oracle_sequence(example)

View File

@@ -143,7 +143,12 @@ def test_accept_blocked_token():
     # 1. test normal behaviour
     nlp1 = English()
     doc1 = nlp1("I live in New York")
-    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 30,
+        "beam_width": 1,
+        "beam_update_prob": 1.0,
+    }
     ner1 = EntityRecognizer(doc1.vocab, default_ner(), **config)
     assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""]
     assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""]
@@ -162,7 +167,12 @@ def test_accept_blocked_token():
     # 2. test blocking behaviour
     nlp2 = English()
     doc2 = nlp2("I live in New York")
-    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 30,
+        "beam_width": 1,
+        "beam_update_prob": 1.0,
+    }
     ner2 = EntityRecognizer(doc2.vocab, default_ner(), **config)
 
     # set "New York" to a blocked entity
@@ -220,7 +230,12 @@ def test_overwrite_token():
     assert [token.ent_type_ for token in doc] == ["", "", "", "", ""]
     # Check that a new ner can overwrite O
-    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 30,
+        "beam_width": 1,
+        "beam_update_prob": 1.0,
+    }
     ner2 = EntityRecognizer(doc.vocab, default_ner(), **config)
     ner2.moves.add_action(5, "")
     ner2.add_label("GPE")

View File

@@ -29,7 +29,12 @@ def tok2vec():
 @pytest.fixture
 def parser(vocab, arc_eager):
-    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 30,
+        "beam_width": 1,
+        "beam_update_prob": 1.0,
+    }
     return Parser(vocab, model=default_parser(), moves=arc_eager, **config)

View File

@@ -180,6 +180,7 @@ def test_parser_set_sent_starts(en_vocab):
         for token in sent:
             assert token.head in sent
 
+
 def test_overfitting_IO():
     # Simple test to try and quickly overfit the dependency parser - ensuring the ML models work correctly
     nlp = English()

View File

@@ -16,7 +16,12 @@ def vocab():
 @pytest.fixture
 def parser(vocab):
-    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 30,
+        "beam_width": 1,
+        "beam_update_prob": 1.0,
+    }
     parser = DependencyParser(vocab, default_parser(), **config)
     parser.cfg["token_vector_width"] = 4
     parser.cfg["hidden_width"] = 32
@@ -28,7 +33,9 @@ def parser(vocab):
     for i in range(10):
         losses = {}
         doc = Doc(vocab, words=["a", "b", "c", "d"])
-        example = Example.from_dict(doc, {"heads": [1, 1, 3, 3], "deps": ["left", "ROOT", "left", "ROOT"]})
+        example = Example.from_dict(
+            doc, {"heads": [1, 1, 3, 3], "deps": ["left", "ROOT", "left", "ROOT"]}
+        )
         parser.update([example], sgd=sgd, losses=losses)
     return parser

View File

@@ -272,11 +272,13 @@ GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
 def test_overfitting_IO():
     # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
     nlp = English()
-    nlp.add_pipe(nlp.create_pipe('sentencizer'))
+    nlp.add_pipe(nlp.create_pipe("sentencizer"))
 
     # Add a custom component to recognize "Russ Cochran" as an entity for the example training data
     ruler = EntityRuler(nlp)
-    patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}]
+    patterns = [
+        {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}
+    ]
     ruler.add_patterns(patterns)
     nlp.add_pipe(ruler)
@@ -293,7 +295,11 @@ def test_overfitting_IO():
     mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
     mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
     mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
-    mykb.add_alias(alias="Russ Cochran", entities=["Q2146908", "Q7381115"], probabilities=[0.5, 0.5])
+    mykb.add_alias(
+        alias="Russ Cochran",
+        entities=["Q2146908", "Q7381115"],
+        probabilities=[0.5, 0.5],
+    )
 
     # Create the Entity Linker component and add it to the pipeline
     entity_linker = nlp.create_pipe("entity_linker", config={"kb": mykb})

View File

@@ -15,8 +15,17 @@ def test_label_types():
 TRAIN_DATA = [
-    ("I like green eggs", {"morphs": ["Feat=N", "Feat=V", "Feat=J", "Feat=N"], "pos": ["NOUN", "VERB", "ADJ", "NOUN"]}),
-    ("Eat blue ham", {"morphs": ["Feat=V", "Feat=J", "Feat=N"], "pos": ["VERB", "ADJ", "NOUN"]}),
+    (
+        "I like green eggs",
+        {
+            "morphs": ["Feat=N", "Feat=V", "Feat=J", "Feat=N"],
+            "pos": ["NOUN", "VERB", "ADJ", "NOUN"],
+        },
+    ),
+    (
+        "Eat blue ham",
+        {"morphs": ["Feat=V", "Feat=J", "Feat=N"], "pos": ["VERB", "ADJ", "NOUN"]},
+    ),
 ]
@@ -38,7 +47,12 @@ def test_overfitting_IO():
     # test the trained model
     test_text = "I like blue eggs"
     doc = nlp(test_text)
-    gold_morphs = ["Feat=N|POS=NOUN", "Feat=V|POS=VERB", "Feat=J|POS=ADJ", "Feat=N|POS=NOUN"]
+    gold_morphs = [
+        "Feat=N|POS=NOUN",
+        "Feat=V|POS=VERB",
+        "Feat=J|POS=ADJ",
+        "Feat=N|POS=NOUN",
+    ]
     assert [t.morph_ for t in doc] == gold_morphs
 
     # Also test the results are still the same after IO

View File

@@ -7,24 +7,28 @@ from spacy.pipeline.simple_ner import SimpleNER
 import spacy
 
 
-@pytest.fixture(params=[
-    ["PER", "ORG", "LOC", "MISC"],
-    ["GPE", "PERSON", "NUMBER", "CURRENCY", "EVENT"]
-])
+@pytest.fixture(
+    params=[
+        ["PER", "ORG", "LOC", "MISC"],
+        ["GPE", "PERSON", "NUMBER", "CURRENCY", "EVENT"],
+    ]
+)
 def labels(request):
     return request.param
 
 
 @pytest.fixture
 def ops():
     return NumpyOps()
 
 
 def _get_actions(labels):
     action_names = (
-        [f"B{label}" for label in labels] + \
-        [f"I{label}" for label in labels] + \
-        [f"L{label}" for label in labels] + \
-        [f"U{label}" for label in labels] + \
-        ["O"]
+        [f"B{label}" for label in labels]
+        + [f"I{label}" for label in labels]
+        + [f"L{label}" for label in labels]
+        + [f"U{label}" for label in labels]
+        + ["O"]
     )
     A = namedtuple("actions", action_names)
     return A(**{name: i for i, name in enumerate(action_names)})

View File

@@ -270,7 +270,12 @@ def test_issue1963(en_tokenizer):
 @pytest.mark.parametrize("label", ["U-JOB-NAME"])
 def test_issue1967(label):
-    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 30,
+        "beam_width": 1,
+        "beam_update_prob": 1.0,
+    }
     ner = EntityRecognizer(Vocab(), default_ner(), **config)
     example = Example.from_dict(
         Doc(ner.vocab, words=["word"]),
@@ -280,8 +285,8 @@ def test_issue1967(label):
             "tags": ["tag"],
             "heads": [0],
             "deps": ["dep"],
-            "entities": [label]
-        }
+            "entities": [label],
+        },
     )
     assert "JOB-NAME" in ner.moves.get_actions(gold_parses=[example])[1]

View File

@@ -196,7 +196,12 @@ def test_issue3345():
     doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
     doc[4].is_sent_start = True
     ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}])
-    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 30,
+        "beam_width": 1,
+        "beam_update_prob": 1.0,
+    }
     ner = EntityRecognizer(doc.vocab, default_ner(), **config)
     # Add the OUT action. I wouldn't have thought this would be necessary...
     ner.moves.add_action(5, "")

View File

@@ -6,7 +6,12 @@ from spacy.pipeline.defaults import default_parser
 def test_issue3830_no_subtok():
     """Test that the parser doesn't have subtok label if not learn_tokens"""
-    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 30,
+        "beam_width": 1,
+        "beam_update_prob": 1.0,
+    }
     parser = DependencyParser(Vocab(), default_parser(), **config)
     parser.add_label("nsubj")
     assert "subtok" not in parser.labels
@@ -16,7 +21,12 @@ def test_issue3830_no_subtok():
 def test_issue3830_with_subtok():
     """Test that the parser does have subtok label if learn_tokens=True."""
-    config = {"learn_tokens": True, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    config = {
+        "learn_tokens": True,
+        "min_action_freq": 30,
+        "beam_width": 1,
+        "beam_update_prob": 1.0,
+    }
     parser = DependencyParser(Vocab(), default_parser(), **config)
     parser.add_label("nsubj")
     assert "subtok" not in parser.labels

View File

@@ -74,7 +74,12 @@ def test_issue4042_bug2():
         output_dir.mkdir()
         ner1.to_disk(output_dir)
-        config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+        config = {
+            "learn_tokens": False,
+            "min_action_freq": 30,
+            "beam_width": 1,
+            "beam_update_prob": 1.0,
+        }
         ner2 = EntityRecognizer(vocab, default_ner(), **config)
         ner2.from_disk(output_dir)
         assert len(ner2.labels) == 2

View File

@@ -16,7 +16,12 @@ def test_issue4313():
     beam_width = 16
     beam_density = 0.0001
     nlp = English()
-    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 30,
+        "beam_width": 1,
+        "beam_update_prob": 1.0,
+    }
     ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
     ner.add_label("SOME_LABEL")
     ner.begin_training([])

View File

@@ -1,4 +1,5 @@
 import pytest
 
+
 # TODO
 # from spacy.gold.converters.conllu2docs import conllu2docs

View File

@@ -12,7 +12,12 @@ test_parsers = [DependencyParser, EntityRecognizer]
 @pytest.fixture
 def parser(en_vocab):
-    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 30,
+        "beam_width": 1,
+        "beam_update_prob": 1.0,
+    }
     parser = DependencyParser(en_vocab, default_parser(), **config)
     parser.add_label("nsubj")
     return parser

View File

@@ -36,7 +36,9 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
     new_vocab1 = Vocab().from_bytes(vocab1_b)
     assert new_vocab1.to_bytes() == vocab1_b
     assert len(new_vocab1.strings) == len(strings1) + 2  # adds _SP and POS=SPACE
-    assert sorted([s for s in new_vocab1.strings]) == sorted(strings1 + list(default_strings))
+    assert sorted([s for s in new_vocab1.strings]) == sorted(
+        strings1 + list(default_strings)
+    )
 
 
 @pytest.mark.parametrize("strings1,strings2", test_strings)

View File

@@ -3,6 +3,7 @@ import pytest
 from spacy.lang.en import English
 from spacy.gold.converters import iob2docs, conll_ner2docs
 from spacy.cli.pretrain import make_docs
+
 # TODO
 # from spacy.gold.converters import conllu2docs

View File

@ -155,7 +155,18 @@ def test_gold_biluo_misalign(en_vocab):
def test_split_sentences(en_vocab): def test_split_sentences(en_vocab):
words = ["I", "flew", "to", "San Francisco Valley", "had", "loads of fun"] words = ["I", "flew", "to", "San Francisco Valley", "had", "loads of fun"]
doc = Doc(en_vocab, words=words) doc = Doc(en_vocab, words=words)
gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "had", "loads", "of", "fun"] gold_words = [
"I",
"flew",
"to",
"San",
"Francisco",
"Valley",
"had",
"loads",
"of",
"fun",
]
sent_starts = [True, False, False, False, False, False, True, False, False, False] sent_starts = [True, False, False, False, False, False, True, False, False, False]
example = Example.from_dict(doc, {"words": gold_words, "sent_starts": sent_starts}) example = Example.from_dict(doc, {"words": gold_words, "sent_starts": sent_starts})
assert example.text == "I flew to San Francisco Valley had loads of fun " assert example.text == "I flew to San Francisco Valley had loads of fun "
@ -166,7 +177,16 @@ def test_split_sentences(en_vocab):
words = ["I", "flew", "to", "San", "Francisco", "Valley", "had", "loads", "of fun"] words = ["I", "flew", "to", "San", "Francisco", "Valley", "had", "loads", "of fun"]
doc = Doc(en_vocab, words=words) doc = Doc(en_vocab, words=words)
gold_words = ["I", "flew", "to", "San Francisco", "Valley", "had", "loads of", "fun"] gold_words = [
"I",
"flew",
"to",
"San Francisco",
"Valley",
"had",
"loads of",
"fun",
]
sent_starts = [True, False, False, False, False, True, False, False] sent_starts = [True, False, False, False, False, True, False, False]
example = Example.from_dict(doc, {"words": gold_words, "sent_starts": sent_starts}) example = Example.from_dict(doc, {"words": gold_words, "sent_starts": sent_starts})
assert example.text == "I flew to San Francisco Valley had loads of fun " assert example.text == "I flew to San Francisco Valley had loads of fun "
@ -195,7 +215,15 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
gold_words = ["I", "flew to", "San Francisco Valley", "."] gold_words = ["I", "flew to", "San Francisco Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
assert example.get_aligned("ENT_IOB") == [2, 2, 2, 3, 1, 1, 2] assert example.get_aligned("ENT_IOB") == [2, 2, 2, 3, 1, 1, 2]
assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "LOC", "LOC", "LOC", ""] assert example.get_aligned("ENT_TYPE", as_string=True) == [
"",
"",
"",
"LOC",
"LOC",
"LOC",
"",
]
# misaligned # misaligned
words = ["I flew", "to", "San Francisco", "Valley", "."] words = ["I flew", "to", "San Francisco", "Valley", "."]
@ -206,11 +234,21 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
entities = [(offset_start, offset_end, "LOC")] entities = [(offset_start, offset_end, "LOC")]
links = {(offset_start, offset_end): {"Q816843": 1.0}} links = {(offset_start, offset_end): {"Q816843": 1.0}}
gold_words = ["I", "flew to", "San", "Francisco Valley", "."] gold_words = ["I", "flew to", "San", "Francisco Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities, "links": links}) example = Example.from_dict(
doc, {"words": gold_words, "entities": entities, "links": links}
)
assert example.get_aligned("ENT_IOB") == [2, 2, 3, 1, 2] assert example.get_aligned("ENT_IOB") == [2, 2, 3, 1, 2]
assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "LOC", "LOC", ""] assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "LOC", "LOC", ""]
assert example.get_aligned("ENT_KB_ID", as_string=True) == ["", "", "Q816843", "Q816843", ""] assert example.get_aligned("ENT_KB_ID", as_string=True) == [
assert example.to_dict()["doc_annotation"]["links"][(offset_start, offset_end)] == {"Q816843": 1.0} "",
"",
"Q816843",
"Q816843",
"",
]
assert example.to_dict()["doc_annotation"]["links"][(offset_start, offset_end)] == {
"Q816843": 1.0
}
# additional whitespace tokens in GoldParse words # additional whitespace tokens in GoldParse words
words, spaces = get_words_and_spaces( words, spaces = get_words_and_spaces(
@ -221,26 +259,55 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
gold_words = ["I", "flew", " ", "to", "San Francisco Valley", "."] gold_words = ["I", "flew", " ", "to", "San Francisco Valley", "."]
gold_spaces = [True, True, False, True, False, False] gold_spaces = [True, True, False, True, False, False]
example = Example.from_dict(doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities}) example = Example.from_dict(
doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities}
)
assert example.get_aligned("ENT_IOB") == [2, 2, 2, 2, 3, 1, 2] assert example.get_aligned("ENT_IOB") == [2, 2, 2, 2, 3, 1, 2]
assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "", "LOC", "LOC", ""] assert example.get_aligned("ENT_TYPE", as_string=True) == [
"",
"",
"",
"",
"LOC",
"LOC",
"",
]
# from issue #4791 # from issue #4791
doc = en_tokenizer("I'll return the ₹54 amount") doc = en_tokenizer("I'll return the ₹54 amount")
gold_words = ["I", "'ll", "return", "the", "", "54", "amount"] gold_words = ["I", "'ll", "return", "the", "", "54", "amount"]
gold_spaces = [False, True, True, True, False, True, False] gold_spaces = [False, True, True, True, False, True, False]
entities = [(16, 19, "MONEY")] entities = [(16, 19, "MONEY")]
example = Example.from_dict(doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities}) example = Example.from_dict(
doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities}
)
assert example.get_aligned("ENT_IOB") == [2, 2, 2, 2, 3, 2] assert example.get_aligned("ENT_IOB") == [2, 2, 2, 2, 3, 2]
assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "", "MONEY", ""] assert example.get_aligned("ENT_TYPE", as_string=True) == [
"",
"",
"",
"",
"MONEY",
"",
]
doc = en_tokenizer("I'll return the $54 amount") doc = en_tokenizer("I'll return the $54 amount")
gold_words = ["I", "'ll", "return", "the", "$", "54", "amount"] gold_words = ["I", "'ll", "return", "the", "$", "54", "amount"]
gold_spaces = [False, True, True, True, False, True, False] gold_spaces = [False, True, True, True, False, True, False]
entities = [(16, 19, "MONEY")] entities = [(16, 19, "MONEY")]
example = Example.from_dict(doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities}) example = Example.from_dict(
doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities}
)
assert example.get_aligned("ENT_IOB") == [2, 2, 2, 2, 3, 1, 2] assert example.get_aligned("ENT_IOB") == [2, 2, 2, 2, 3, 1, 2]
assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "", "MONEY", "MONEY", ""] assert example.get_aligned("ENT_TYPE", as_string=True) == [
"",
"",
"",
"",
"MONEY",
"MONEY",
"",
]
def test_roundtrip_offsets_biluo_conversion(en_tokenizer): def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
@ -311,7 +378,9 @@ def test_roundtrip_docs_to_json(doc):
assert lemmas == [t.lemma_ for t in reloaded_example.reference] assert lemmas == [t.lemma_ for t in reloaded_example.reference]
assert deps == [t.dep_ for t in reloaded_example.reference] assert deps == [t.dep_ for t in reloaded_example.reference]
assert heads == [t.head.i for t in reloaded_example.reference] assert heads == [t.head.i for t in reloaded_example.reference]
assert ents == [(e.start_char, e.end_char, e.label_) for e in reloaded_example.reference.ents] assert ents == [
(e.start_char, e.end_char, e.label_) for e in reloaded_example.reference.ents
]
assert "TRAVEL" in reloaded_example.reference.cats assert "TRAVEL" in reloaded_example.reference.cats
assert "BAKING" in reloaded_example.reference.cats assert "BAKING" in reloaded_example.reference.cats
assert cats["TRAVEL"] == reloaded_example.reference.cats["TRAVEL"] assert cats["TRAVEL"] == reloaded_example.reference.cats["TRAVEL"]
@@ -375,7 +444,9 @@ def test_ignore_misaligned(doc):
     # doesn't raise an AlignmentError, but there is nothing to iterate over
     # because the only example can't be aligned
-    train_reloaded_example = list(goldcorpus.train_dataset(nlp, ignore_misaligned=True))
+    train_reloaded_example = list(
+        goldcorpus.train_dataset(nlp, ignore_misaligned=True)
+    )
     assert len(train_reloaded_example) == 0
@@ -389,7 +460,9 @@ def test_make_orth_variants(doc):
     # due to randomness, test only that this runs with no errors for now
     train_example = next(goldcorpus.train_dataset(nlp))
-    variant_example = make_orth_variants_example(nlp, train_example, orth_variant_level=0.2)
+    variant_example = make_orth_variants_example(
+        nlp, train_example, orth_variant_level=0.2
+    )
 @pytest.mark.parametrize(
@@ -430,7 +503,9 @@ def test_goldparse_startswith_space(en_tokenizer):
     entities = ["U-DATE"]
     deps = ["ROOT"]
     heads = [0]
-    example = Example.from_dict(doc, {"words": gold_words, "entities": entities, "deps":deps, "heads": heads})
+    example = Example.from_dict(
+        doc, {"words": gold_words, "entities": entities, "deps": deps, "heads": heads}
+    )
     assert example.get_aligned("ENT_IOB") == [None, 3]
     assert example.get_aligned("ENT_TYPE", as_string=True) == [None, "DATE"]
     assert example.get_aligned("DEP", as_string=True) == [None, "ROOT"]
@@ -441,7 +516,12 @@ def test_gold_constructor():
     nlp = English()
     doc = nlp("This is a sentence")
    example = Example.from_dict(doc, {"cats": {"cat1": 1.0, "cat2": 0.0}})
-    assert example.get_aligned("ORTH", as_string=True) == ["This", "is", "a", "sentence"]
+    assert example.get_aligned("ORTH", as_string=True) == [
+        "This",
+        "is",
+        "a",
+        "sentence",
+    ]
     assert example.reference.cats["cat1"]
     assert not example.reference.cats["cat2"]
@@ -496,7 +576,7 @@ def test_split_sents(merged_dict):
     nlp = English()
     example = Example.from_dict(
         Doc(nlp.vocab, words=merged_dict["words"], spaces=merged_dict["spaces"]),
-        merged_dict
+        merged_dict,
     )
     assert example.text == "Hi there everyone It is just me"
@@ -522,10 +602,7 @@ def test_tuples_to_example(vocab, merged_dict):
     cats = {"TRAVEL": 1.0, "BAKING": 0.0}
     merged_dict = dict(merged_dict)
     merged_dict["cats"] = cats
-    ex = Example.from_dict(
-        Doc(vocab, words=merged_dict["words"]),
-        merged_dict
-    )
+    ex = Example.from_dict(Doc(vocab, words=merged_dict["words"]), merged_dict)
     words = [token.text for token in ex.reference]
     assert words == merged_dict["words"]
     tags = [token.tag_ for token in ex.reference]

@@ -36,9 +36,7 @@ def test_language_update(nlp):
 def test_language_evaluate(nlp):
     text = "hello world"
-    annots = {
-        "doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}
-    }
+    annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}}
     doc = Doc(nlp.vocab, words=text.split(" "))
     # Evaluate with text and dict
     nlp.evaluate([(text, annots)])

@@ -32,7 +32,9 @@ def test_Example_from_dict_invalid(annots):
         Example.from_dict(predicted, annots)
-@pytest.mark.parametrize("pred_words", [["ice", "cream"], ["icecream"], ["i", "ce", "cream"]])
+@pytest.mark.parametrize(
+    "pred_words", [["ice", "cream"], ["icecream"], ["i", "ce", "cream"]]
+)
 @pytest.mark.parametrize("annots", [{"words": ["icecream"], "tags": ["NN"]}])
 def test_Example_from_dict_with_tags(pred_words, annots):
     vocab = Vocab()
@@ -161,7 +163,15 @@ def test_Example_from_dict_with_entities(annots):
     example = Example.from_dict(predicted, annots)
     assert len(list(example.reference.ents)) == 2
-    assert [example.reference[i].ent_iob_ for i in range(7)] == ["O", "O", "B", "I", "O", "B", "O"]
+    assert [example.reference[i].ent_iob_ for i in range(7)] == [
+        "O",
+        "O",
+        "B",
+        "I",
+        "O",
+        "B",
+        "O",
+    ]
     assert example.get_aligned("ENT_IOB") == [2, 2, 3, 1, 2, 3, 2]
     assert example.reference[2].ent_type_ == "LOC"
@@ -174,7 +184,10 @@ def test_Example_from_dict_with_entities(annots):
     [
         {
             "words": ["I", "like", "New", "York", "and", "Berlin", "."],
-            "entities": [(0, 4, "LOC"), (21, 27, "LOC")], # not aligned to token boundaries
+            "entities": [
+                (0, 4, "LOC"),
+                (21, 27, "LOC"),
+            ],  # not aligned to token boundaries
         }
     ],
 )
@@ -192,7 +205,10 @@ def test_Example_from_dict_with_entities_invalid(annots):
         {
             "words": ["I", "like", "New", "York", "and", "Berlin", "."],
             "entities": [(7, 15, "LOC"), (20, 26, "LOC")],
-            "links": {(7, 15): {"Q60": 1.0, "Q64": 0.0}, (20, 26): {"Q60": 0.0, "Q64": 1.0}},
+            "links": {
+                (7, 15): {"Q60": 1.0, "Q64": 0.0},
+                (20, 26): {"Q60": 0.0, "Q64": 1.0},
+            },
         }
     ],
 )
@@ -224,4 +240,3 @@ def test_Example_from_dict_with_links_invalid(annots):
     predicted = Doc(vocab, words=annots["words"])
     with pytest.raises(ValueError):
         Example.from_dict(predicted, annots)

@@ -42,6 +42,7 @@ test_ner_apple = [
     ]
 ]
 @pytest.fixture
 def tagged_doc():
     text = "Sarah's sister flew to Silicon Valley via London."

@@ -26,7 +26,9 @@ def test_util_minibatch(doc_sizes, expected_batches):
     docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
     tol = 0.2
     batch_size = 1000
-    batches = list(minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=True))
+    batches = list(
+        minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=True)
+    )
     assert [len(batch) for batch in batches] == expected_batches
     max_size = batch_size + batch_size * tol
@@ -50,7 +52,7 @@ def test_util_minibatch_oversize(doc_sizes, expected_batches):
     docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
     tol = 0.2
     batch_size = 1000
-    batches = list(minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=False))
+    batches = list(
+        minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=False)
+    )
     assert [len(batch) for batch in batches] == expected_batches

@@ -27,7 +27,15 @@ def make_tempdir():
 def get_doc(
-    vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None, morphs=None
+    vocab,
+    words=[],
+    pos=None,
+    heads=None,
+    deps=None,
+    tags=None,
+    ents=None,
+    lemmas=None,
+    morphs=None,
 ):
     """Create Doc object from given vocab, words and annotations."""
     if deps and not heads:

@@ -9,16 +9,7 @@ from ..attrs import SPACY, ORTH, intify_attr
 from ..errors import Errors
-ALL_ATTRS = (
-    "ORTH",
-    "TAG",
-    "HEAD",
-    "DEP",
-    "ENT_IOB",
-    "ENT_TYPE",
-    "LEMMA",
-    "MORPH"
-)
+ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "LEMMA", "MORPH")
 class DocBin(object):