Mirror of https://github.com/explosion/spaCy.git, synced 2025-03-12 07:15:48 +03:00

Format

This commit is contained in:
parent 455dc0d9e2
commit 6a75992af6
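The hunks below change formatting only: strings use double quotes, multi-line calls gain trailing commas, and long call expressions are either wrapped to one argument group per line or collapsed when they fit. A minimal before/after sketch of the recurring pattern, mirroring one of the HashEmbed changes further down (black-style line wrapping is assumed as the convention; the commit message itself only says "Format"):

    # before: one long call
    prefix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout)

    # after: wrapped to fit the line-length limit, behaviour unchanged
    prefix = HashEmbed(
        nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout
    )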

@@ -134,7 +134,7 @@ def verify_cli_args(
merge_subtokens,
converter,
ner_map,
lang
lang,
):
if converter == "ner" or converter == "iob":
input_data = input_path.open("r", encoding="utf-8").read()

@@ -148,7 +148,7 @@ def verify_cli_args(
else:
msg.warn(
"Can't automatically detect NER format. Conversion may not",
"succeed. See https://spacy.io/api/cli#convert"
"succeed. See https://spacy.io/api/cli#convert",
)
if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
# TODO: support msgpack via stdout in srsly?

@@ -178,7 +178,6 @@ def train_cli(
)
def train(
config_path,
data_paths,

@@ -238,8 +237,7 @@ def train(
tok2vec = tok2vec.get(subpath)
if not tok2vec:
msg.fail(
f"Could not locate the tok2vec model at {tok2vec_path}.",
exits=1,
f"Could not locate the tok2vec model at {tok2vec_path}.", exits=1,
)
tok2vec.from_bytes(weights_data)

@@ -351,7 +349,11 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
try:
weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
except KeyError as e:
raise KeyError(Errors.E983.format(dict='score_weights', key=str(e), keys=list(scores.keys())))
raise KeyError(
Errors.E983.format(
dict="score_weights", key=str(e), keys=list(scores.keys())
)
)
scores["speed"] = wps
return weighted_score, scores

@@ -500,15 +502,23 @@ def setup_printer(training, nlp):
]
except KeyError as e:
raise KeyError(
Errors.E983.format(dict='scores (losses)', key=str(e), keys=list(info["losses"].keys())))
Errors.E983.format(
dict="scores (losses)", key=str(e), keys=list(info["losses"].keys())
)
)
try:
scores = [
"{0:.2f}".format(float(info["other_scores"][col]))
for col in score_cols
"{0:.2f}".format(float(info["other_scores"][col])) for col in score_cols
]
except KeyError as e:
raise KeyError(Errors.E983.format(dict='scores (other)', key=str(e), keys=list(info["other_scores"].keys())))
raise KeyError(
Errors.E983.format(
dict="scores (other)",
key=str(e),
keys=list(info["other_scores"].keys()),
)
)
data = (
[info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))]
)

@@ -5,7 +5,9 @@ import itertools
def make_orth_variants_example(nlp, example, orth_variant_level=0.0): # TODO: naming
raw_text = example.text
orig_dict = example.to_dict()
variant_text, variant_token_annot = make_orth_variants(nlp, raw_text, orig_dict["token_annotation"], orth_variant_level)
variant_text, variant_token_annot = make_orth_variants(
nlp, raw_text, orig_dict["token_annotation"], orth_variant_level
)
doc = nlp.make_doc(variant_text)
orig_dict["token_annotation"] = variant_token_annot
return example.from_dict(doc, orig_dict)

@@ -43,10 +43,7 @@ def conllu2json(
raw += example.text
sentences.append(
generate_sentence(
example.to_dict(),
has_ner_tags,
MISC_NER_PATTERN,
ner_map=ner_map,
example.to_dict(), has_ner_tags, MISC_NER_PATTERN, ner_map=ner_map,
)
)
# Real-sized documents could be extracted using the comments on the

@@ -8,6 +8,7 @@ from ..example import _fix_legacy_dict_data, _parse_example_dict_data
from ...util import load_model
from ...lang.xx import MultiLanguage
@contextlib.contextmanager
def make_tempdir():
d = Path(tempfile.mkdtemp())

@@ -15,11 +16,7 @@ def make_tempdir():
shutil.rmtree(str(d))
def json2docs(
input_data,
model=None,
**kwargs
):
def json2docs(input_data, model=None, **kwargs):
nlp = load_model(model) if model is not None else MultiLanguage()
docs = []
with make_tempdir() as tmp_dir:

@@ -29,10 +26,6 @@ def json2docs(
for json_annot in read_json_file(json_path):
example_dict = _fix_legacy_dict_data(json_annot)
tok_dict, doc_dict = _parse_example_dict_data(example_dict)
doc = annotations2doc(
nlp.vocab,
tok_dict,
doc_dict
)
doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
docs.append(doc)
return docs

@@ -12,6 +12,7 @@ class Corpus:
DOCS: https://spacy.io/api/goldcorpus
"""
def __init__(self, train_loc, dev_loc, limit=0):
"""Create a GoldCorpus.

@@ -54,7 +54,7 @@ def biluo_tags_from_doc(doc, missing="O"):
return biluo_tags_from_offsets(
doc,
[(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents],
missing=missing
missing=missing,
)

@@ -542,7 +542,6 @@ class Language(object):
raise ValueError(Errors.E979.format(type=type(eg)))
return converted_examples
def update(
self,
examples,

@@ -822,7 +821,7 @@ class Language(object):
batch_size=batch_size,
disable=disable,
n_process=n_process,
component_cfg=component_cfg
component_cfg=component_cfg,
)
for doc, context in zip(docs, contexts):
yield (doc, context)

@@ -51,7 +51,13 @@ class Lemmatizer(object):
index_table = self.lookups.get_table("lemma_index", {})
exc_table = self.lookups.get_table("lemma_exc", {})
rules_table = self.lookups.get_table("lemma_rules", {})
if not any((index_table.get(univ_pos), exc_table.get(univ_pos), rules_table.get(univ_pos))):
if not any(
(
index_table.get(univ_pos),
exc_table.get(univ_pos),
rules_table.get(univ_pos),
)
):
if univ_pos == "propn":
return [string]
else:

@@ -14,7 +14,7 @@ def BILUO() -> Model[Padded, Padded]:
forward,
init=init,
dims={"nO": None},
attrs={"get_num_actions": get_num_actions}
attrs={"get_num_actions": get_num_actions},
)

@@ -12,7 +12,7 @@ def IOB() -> Model[Padded, Padded]:
forward,
init=init,
dims={"nO": None},
attrs={"get_num_actions": get_num_actions}
attrs={"get_num_actions": get_num_actions},
)

@@ -7,7 +7,12 @@ def build_multi_task_model(tok2vec, maxout_pieces, token_vector_width, nO=None):
softmax = Softmax(nO=nO, nI=token_vector_width * 2)
model = chain(
tok2vec,
Maxout(nO=token_vector_width * 2, nI=token_vector_width, nP=maxout_pieces, dropout=0.0),
Maxout(
nO=token_vector_width * 2,
nI=token_vector_width,
nP=maxout_pieces,
dropout=0.0,
),
LayerNorm(token_vector_width * 2),
softmax,
)

@@ -20,7 +25,11 @@ def build_cloze_multi_task_model(vocab, tok2vec, maxout_pieces, nO=None):
# nO = vocab.vectors.data.shape[1]
output_layer = chain(
Maxout(
nO=nO, nI=tok2vec.get_dim("nO"), nP=maxout_pieces, normalize=True, dropout=0.0
nO=nO,
nI=tok2vec.get_dim("nO"),
nP=maxout_pieces,
normalize=True,
dropout=0.0,
),
Linear(nO=nO, nI=nO, init_W=zero_init),
)

@@ -39,7 +48,9 @@ def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15):
def mlm_forward(model, docs, is_train):
mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob)
mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
output, backprop = model.get_ref("wrapped-model").begin_update(docs) # drop=drop
output, backprop = model.get_ref("wrapped-model").begin_update(
docs
) # drop=drop
def mlm_backward(d_output):
d_output *= 1 - mask

@@ -16,18 +16,14 @@ def build_tb_parser_model(
nO=None,
):
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
tok2vec = chain(
tok2vec,
with_array(Linear(hidden_width, t2v_width)),
list2array(),
)
tok2vec = chain(tok2vec, with_array(Linear(hidden_width, t2v_width)), list2array(),)
tok2vec.set_dim("nO", hidden_width)
lower = PrecomputableAffine(
nO=hidden_width if use_upper else nO,
nF=nr_feature_tokens,
nI=tok2vec.get_dim("nO"),
nP=maxout_pieces
nP=maxout_pieces,
)
if use_upper:
with use_ops("numpy"):

@@ -1,6 +1,14 @@
import functools
from typing import List, Tuple, Dict, Optional
from thinc.api import Ops, Model, Linear, Softmax, with_array, softmax_activation, padded2list
from thinc.api import (
Ops,
Model,
Linear,
Softmax,
with_array,
softmax_activation,
padded2list,
)
from thinc.api import chain, list2padded, configure_normal_init
from thinc.api import Dropout
from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d

@@ -12,12 +20,12 @@ from ...util import registry
@registry.architectures.register("spacy.BiluoTagger.v1")
def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], List[Floats2d]]:
def BiluoTagger(
tok2vec: Model[List[Doc], List[Floats2d]]
) -> Model[List[Doc], List[Floats2d]]:
biluo = BILUO()
linear = Linear(
nO=None,
nI=tok2vec.get_dim("nO"),
init_W=configure_normal_init(mean=0.02)
nO=None, nI=tok2vec.get_dim("nO"), init_W=configure_normal_init(mean=0.02)
)
model = chain(
tok2vec,

@@ -25,7 +33,7 @@ def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], L
with_array(chain(Dropout(0.1), linear)),
biluo,
with_array(softmax_activation()),
padded2list()
padded2list(),
)
return Model(

@@ -35,11 +43,14 @@ def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], L
layers=[model, linear],
refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo},
dims={"nO": None},
attrs={"get_num_actions": biluo.attrs["get_num_actions"]}
attrs={"get_num_actions": biluo.attrs["get_num_actions"]},
)
@registry.architectures.register("spacy.IOBTagger.v1")
def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], List[Floats2d]]:
def IOBTagger(
tok2vec: Model[List[Doc], List[Floats2d]]
) -> Model[List[Doc], List[Floats2d]]:
biluo = IOB()
linear = Linear(nO=None, nI=tok2vec.get_dim("nO"))
model = chain(

@@ -48,7 +59,7 @@ def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], Lis
with_array(linear),
biluo,
with_array(softmax_activation()),
padded2list()
padded2list(),
)
return Model(

@@ -58,11 +69,10 @@ def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], Lis
layers=[model],
refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo},
dims={"nO": None},
attrs={"get_num_actions": biluo.attrs["get_num_actions"]}
attrs={"get_num_actions": biluo.attrs["get_num_actions"]},
)
def init(model: Model[List[Doc], List[Floats2d]], X=None, Y=None) -> None:
if model.get_dim("nO") is None and Y:
model.set_dim("nO", Y[0].shape[1])

@@ -1,7 +1,30 @@
from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic, ParametricAttention
from thinc.api import (
Model,
reduce_mean,
Linear,
list2ragged,
Logistic,
ParametricAttention,
)
from thinc.api import chain, concatenate, clone, Dropout
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum, Relu, residual, expand_window
from thinc.api import HashEmbed, with_ragged, with_array, with_cpu, uniqued, FeatureExtractor
from thinc.api import (
SparseLinear,
Softmax,
softmax_activation,
Maxout,
reduce_sum,
Relu,
residual,
expand_window,
)
from thinc.api import (
HashEmbed,
with_ragged,
with_array,
with_cpu,
uniqued,
FeatureExtractor,
)
from ..spacy_vectors import SpacyVectors
from ... import util

@@ -50,14 +73,31 @@ def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO
@registry.architectures.register("spacy.TextCat.v1")
def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_classes, ngram_size,
window_size, conv_depth, dropout, nO=None):
def build_text_classifier(
width,
embed_size,
pretrained_vectors,
exclusive_classes,
ngram_size,
window_size,
conv_depth,
dropout,
nO=None,
):
cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
lower = HashEmbed(nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout)
prefix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout)
suffix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout)
shape = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout)
lower = HashEmbed(
nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout
)
prefix = HashEmbed(
nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout
)
suffix = HashEmbed(
nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout
)
shape = HashEmbed(
nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout
)
width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
trained_vectors = FeatureExtractor(cols) >> with_array(

@@ -83,8 +123,15 @@ def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_class
vectors_width = width
tok2vec = vector_layer >> with_array(
Maxout(width, vectors_width, normalize=True)
>> residual((expand_window(window_size=window_size)
>> Maxout(nO=width, nI=width * ((window_size * 2) + 1), normalize=True))) ** conv_depth,
>> residual(
(
expand_window(window_size=window_size)
>> Maxout(
nO=width, nI=width * ((window_size * 2) + 1), normalize=True
)
)
)
** conv_depth,
pad=conv_depth,
)
cnn_model = (

@@ -98,15 +145,16 @@ def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_class
)
linear_model = build_bow_text_classifier(
nO=nO, ngram_size=ngram_size, exclusive_classes=exclusive_classes, no_output_layer=False
nO=nO,
ngram_size=ngram_size,
exclusive_classes=exclusive_classes,
no_output_layer=False,
)
nO_double = nO * 2 if nO else None
if exclusive_classes:
output_layer = Softmax(nO=nO, nI=nO_double)
else:
output_layer = (
Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic()
)
output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic()
model = (linear_model | cnn_model) >> output_layer
model.set_ref("tok2vec", tok2vec)
if model.has_dim("nO") is not False:

@@ -99,7 +99,13 @@ def hash_charembed_cnn(
@registry.architectures.register("spacy.HashEmbedBiLSTM.v1")
def hash_embed_bilstm_v1(
pretrained_vectors, width, depth, embed_size, subword_features, maxout_pieces, dropout
pretrained_vectors,
width,
depth,
embed_size,
subword_features,
maxout_pieces,
dropout,
):
# Does not use character embeddings: set to False by default
return build_Tok2Vec_model(

@@ -141,21 +147,24 @@ def hash_char_embed_bilstm_v1(
@registry.architectures.register("spacy.LayerNormalizedMaxout.v1")
def LayerNormalizedMaxout(width, maxout_pieces):
return Maxout(
nO=width,
nP=maxout_pieces,
dropout=0.0,
normalize=True,
)
return Maxout(nO=width, nP=maxout_pieces, dropout=0.0, normalize=True,)
@registry.architectures.register("spacy.MultiHashEmbed.v1")
def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix, dropout):
def MultiHashEmbed(
columns, width, rows, use_subwords, pretrained_vectors, mix, dropout
):
norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout)
if use_subwords:
prefix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout)
suffix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout)
shape = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout)
prefix = HashEmbed(
nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout
)
suffix = HashEmbed(
nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout
)
shape = HashEmbed(
nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout
)
if pretrained_vectors:
glove = StaticVectors(

@@ -195,7 +204,13 @@ def CharacterEmbed(columns, width, rows, nM, nC, features, dropout):
def MaxoutWindowEncoder(width, window_size, maxout_pieces, depth):
cnn = chain(
expand_window(window_size=window_size),
Maxout(nO=width, nI=width * ((window_size * 2) + 1), nP=maxout_pieces, dropout=0.0, normalize=True),
Maxout(
nO=width,
nI=width * ((window_size * 2) + 1),
nP=maxout_pieces,
dropout=0.0,
normalize=True,
),
)
model = clone(residual(cnn), depth)
model.set_dim("nO", width)

@@ -247,11 +262,19 @@ def build_Tok2Vec_model(
subword_features = False
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout)
norm = HashEmbed(
nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout
)
if subword_features:
prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout)
suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout)
shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout)
prefix = HashEmbed(
nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout
)
suffix = HashEmbed(
nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout
)
shape = HashEmbed(
nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout
)
else:
prefix, suffix, shape = (None, None, None)
if pretrained_vectors is not None:

@@ -20,8 +20,8 @@ def TransitionModel(tok2vec, lower, upper, unseen_classes=set()):
attrs={
"has_upper": has_upper,
"unseen_classes": set(unseen_classes),
"resize_output": resize_output
}
"resize_output": resize_output,
},
)

@@ -31,7 +31,7 @@ def forward(model, X, is_train):
model.layers,
unseen_classes=model.attrs["unseen_classes"],
train=is_train,
has_upper=model.attrs["has_upper"]
has_upper=model.attrs["has_upper"],
)
return step_model, step_model.finish_steps

@@ -62,7 +62,7 @@ def resize_output(model, new_nO):
nI = None
if smaller.has_dim("nI"):
nI = smaller.get_dim("nI")
with use_ops('numpy'):
with use_ops("numpy"):
larger = Linear(nO=new_nO, nI=nI)
larger.init = smaller.init
# it could be that the model is not initialized yet, then skip this bit

@@ -21,9 +21,7 @@ class SimpleNER(Pipe):
self.model = model
self.cfg = {"labels": []}
self.loss_func = SequenceCategoricalCrossentropy(
names=self.get_tag_names(),
normalize=True,
missing_value=None
names=self.get_tag_names(), normalize=True, missing_value=None
)
assert self.model is not None

@@ -42,17 +40,17 @@ class SimpleNER(Pipe):
def get_tag_names(self):
if self.is_biluo:
return (
[f"B-{label}" for label in self.labels] +
[f"I-{label}" for label in self.labels] +
[f"L-{label}" for label in self.labels] +
[f"U-{label}" for label in self.labels] +
["O"]
[f"B-{label}" for label in self.labels]
+ [f"I-{label}" for label in self.labels]
+ [f"L-{label}" for label in self.labels]
+ [f"U-{label}" for label in self.labels]
+ ["O"]
)
else:
return (
[f"B-{label}" for label in self.labels] +
[f"I-{label}" for label in self.labels] +
["O"]
[f"B-{label}" for label in self.labels]
+ [f"I-{label}" for label in self.labels]
+ ["O"]
)
def predict(self, docs: List[Doc]) -> List[Floats2d]:

@@ -107,7 +105,7 @@ class SimpleNER(Pipe):
def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
self.cfg.update(kwargs)
if not hasattr(get_examples, '__call__'):
if not hasattr(get_examples, "__call__"):
gold_tuples = get_examples
get_examples = lambda: gold_tuples
labels = _get_labels(get_examples())

@@ -121,9 +119,7 @@ class SimpleNER(Pipe):
self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
link_vectors_to_models(self.vocab)
self.loss_func = SequenceCategoricalCrossentropy(
names=self.get_tag_names(),
normalize=True,
missing_value=None
names=self.get_tag_names(), normalize=True, missing_value=None
)
return sgd

@@ -144,6 +140,6 @@ def _get_labels(examples):
labels = set()
for eg in examples:
for ner_tag in eg.get_aligned("ENT_TYPE", as_string=True):
if ner_tag != 'O' and ner_tag != '-':
if ner_tag != "O" and ner_tag != "-":
labels.add(ner_tag)
return list(sorted(labels))

@@ -97,7 +97,9 @@ class Scorer(object):
for name, component in pipeline:
if name == "textcat":
self.textcat_multilabel = component.model.attrs["multi_label"]
self.textcat_positive_label = component.cfg.get("positive_label", None)
self.textcat_positive_label = component.cfg.get(
"positive_label", None
)
for label in component.cfg.get("labels", []):
self.textcat_auc_per_cat[label] = ROCAUCScore()
self.textcat_f_per_cat[label] = PRFScore()

@@ -359,7 +361,9 @@ class Scorer(object):
(gold_i, gold_head, token.dep_.lower())
)
# Find all NER labels in gold and doc
ent_labels = set([k.label_ for k in gold_doc.ents] + [k.label_ for k in doc.ents])
ent_labels = set(
[k.label_ for k in gold_doc.ents] + [k.label_ for k in doc.ents]
)
# Set up all labels for per type scoring and prepare gold per type
gold_per_ents = {ent_label: set() for ent_label in ent_labels}
for ent_label in ent_labels:

@@ -392,7 +396,10 @@ class Scorer(object):
self.pos.score_set(cand_pos, gold_pos)
self.morphs.score_set(cand_morphs, gold_morphs)
for field in self.morphs_per_feat:
self.morphs_per_feat[field].score_set(cand_morphs_per_feat.get(field, set()), gold_morphs_per_feat.get(field, set()))
self.morphs_per_feat[field].score_set(
cand_morphs_per_feat.get(field, set()),
gold_morphs_per_feat.get(field, set()),
)
self.sent_starts.score_set(cand_sent_starts, gold_sent_starts)
self.labelled.score_set(cand_deps, gold_deps)
for dep in self.labelled_per_dep:

@@ -404,7 +411,9 @@ class Scorer(object):
)
if (
len(gold_doc.cats) > 0
and set(self.textcat_f_per_cat) == set(self.textcat_auc_per_cat) == set(gold_doc.cats)
and set(self.textcat_f_per_cat)
== set(self.textcat_auc_per_cat)
== set(gold_doc.cats)
and set(gold_doc.cats) == set(doc.cats)
):
goldcat = max(gold_doc.cats, key=gold_doc.cats.get)

@@ -9,7 +9,12 @@ from spacy.pipeline.defaults import default_ner
def test_doc_add_entities_set_ents_iob(en_vocab):
text = ["This", "is", "a", "lion"]
doc = get_doc(en_vocab, text)
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner = EntityRecognizer(en_vocab, default_ner(), **config)
ner.begin_training([])
ner(doc)

@@ -26,7 +31,12 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
def test_ents_reset(en_vocab):
text = ["This", "is", "a", "lion"]
doc = get_doc(en_vocab, text)
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner = EntityRecognizer(en_vocab, default_ner(), **config)
ner.begin_training([])
ner(doc)

@@ -17,7 +17,12 @@ def vocab():
@pytest.fixture
def parser(vocab):
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(vocab, default_parser(), **config)
return parser

@@ -35,10 +40,7 @@ def _train_parser(parser):
for i in range(5):
losses = {}
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
gold = {
"heads": [1, 1, 3, 3],
"deps": ["left", "ROOT", "left", "ROOT"]
}
gold = {"heads": [1, 1, 3, 3], "deps": ["left", "ROOT", "left", "ROOT"]}
example = Example.from_dict(doc, gold)
parser.update([example], sgd=sgd, losses=losses)
return parser

@@ -51,10 +53,7 @@ def test_add_label(parser):
for i in range(100):
losses = {}
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
gold = {
"heads": [1, 1, 3, 3],
"deps": ["right", "ROOT", "left", "ROOT"]
}
gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]}
parser.update((doc, gold), sgd=sgd, losses=losses)
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
doc = parser(doc)

@@ -63,7 +62,12 @@ def test_add_label(parser):
def test_add_label_deserializes_correctly():
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner1 = EntityRecognizer(Vocab(), default_ner(), **config)
ner1.add_label("C")
ner1.add_label("B")

@@ -78,6 +82,7 @@ def test_add_label_deserializes_correctly():
for i in range(ner1.moves.n_moves):
assert ner1.moves.get_class_name(i) == ner2.moves.get_class_name(i)
@pytest.mark.parametrize(
"pipe_cls,n_moves,model",
[(DependencyParser, 5, default_parser()), (EntityRecognizer, 4, default_ner())],

@@ -139,7 +139,12 @@ def test_get_oracle_actions():
deps.append(dep)
ents.append(ent)
doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(doc.vocab, default_parser(), **config)
parser.moves.add_action(0, "")
parser.moves.add_action(1, "")

@@ -151,7 +156,9 @@ def test_get_oracle_actions():
parser.moves.add_action(2, dep)
elif head < i:
parser.moves.add_action(3, dep)
example = Example.from_dict(doc, {"words": words, "tags": tags, "heads": heads, "deps": deps})
example = Example.from_dict(
doc, {"words": words, "tags": tags, "heads": heads, "deps": deps}
)
parser.moves.get_oracle_sequence(example)

@@ -143,7 +143,12 @@ def test_accept_blocked_token():
# 1. test normal behaviour
nlp1 = English()
doc1 = nlp1("I live in New York")
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner1 = EntityRecognizer(doc1.vocab, default_ner(), **config)
assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""]
assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""]

@@ -162,7 +167,12 @@ def test_accept_blocked_token():
# 2. test blocking behaviour
nlp2 = English()
doc2 = nlp2("I live in New York")
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner2 = EntityRecognizer(doc2.vocab, default_ner(), **config)
# set "New York" to a blocked entity

@@ -220,7 +230,12 @@ def test_overwrite_token():
assert [token.ent_type_ for token in doc] == ["", "", "", "", ""]
# Check that a new ner can overwrite O
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner2 = EntityRecognizer(doc.vocab, default_ner(), **config)
ner2.moves.add_action(5, "")
ner2.add_label("GPE")

@@ -29,7 +29,12 @@ def tok2vec():
@pytest.fixture
def parser(vocab, arc_eager):
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
return Parser(vocab, model=default_parser(), moves=arc_eager, **config)

@@ -180,6 +180,7 @@ def test_parser_set_sent_starts(en_vocab):
for token in sent:
assert token.head in sent
def test_overfitting_IO():
# Simple test to try and quickly overfit the dependency parser - ensuring the ML models work correctly
nlp = English()

@@ -16,7 +16,12 @@ def vocab():
@pytest.fixture
def parser(vocab):
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(vocab, default_parser(), **config)
parser.cfg["token_vector_width"] = 4
parser.cfg["hidden_width"] = 32

@@ -28,7 +33,9 @@ def parser(vocab):
for i in range(10):
losses = {}
doc = Doc(vocab, words=["a", "b", "c", "d"])
example = Example.from_dict(doc, {"heads": [1, 1, 3, 3], "deps": ["left", "ROOT", "left", "ROOT"]})
example = Example.from_dict(
doc, {"heads": [1, 1, 3, 3], "deps": ["left", "ROOT", "left", "ROOT"]}
)
parser.update([example], sgd=sgd, losses=losses)
return parser

@@ -272,11 +272,13 @@ GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
def test_overfitting_IO():
# Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
nlp = English()
nlp.add_pipe(nlp.create_pipe('sentencizer'))
nlp.add_pipe(nlp.create_pipe("sentencizer"))
# Add a custom component to recognize "Russ Cochran" as an entity for the example training data
ruler = EntityRuler(nlp)
patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}]
patterns = [
{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}
]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)

@@ -293,7 +295,11 @@ def test_overfitting_IO():
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
mykb.add_alias(alias="Russ Cochran", entities=["Q2146908", "Q7381115"], probabilities=[0.5, 0.5])
mykb.add_alias(
alias="Russ Cochran",
entities=["Q2146908", "Q7381115"],
probabilities=[0.5, 0.5],
)
# Create the Entity Linker component and add it to the pipeline
entity_linker = nlp.create_pipe("entity_linker", config={"kb": mykb})

@@ -15,8 +15,17 @@ def test_label_types():
TRAIN_DATA = [
("I like green eggs", {"morphs": ["Feat=N", "Feat=V", "Feat=J", "Feat=N"], "pos": ["NOUN", "VERB", "ADJ", "NOUN"]}),
("Eat blue ham", {"morphs": ["Feat=V", "Feat=J", "Feat=N"], "pos": ["VERB", "ADJ", "NOUN"]}),
(
"I like green eggs",
{
"morphs": ["Feat=N", "Feat=V", "Feat=J", "Feat=N"],
"pos": ["NOUN", "VERB", "ADJ", "NOUN"],
},
),
(
"Eat blue ham",
{"morphs": ["Feat=V", "Feat=J", "Feat=N"], "pos": ["VERB", "ADJ", "NOUN"]},
),
]

@@ -38,7 +47,12 @@ def test_overfitting_IO():
# test the trained model
test_text = "I like blue eggs"
doc = nlp(test_text)
gold_morphs = ["Feat=N|POS=NOUN", "Feat=V|POS=VERB", "Feat=J|POS=ADJ", "Feat=N|POS=NOUN"]
gold_morphs = [
"Feat=N|POS=NOUN",
"Feat=V|POS=VERB",
"Feat=J|POS=ADJ",
"Feat=N|POS=NOUN",
]
assert [t.morph_ for t in doc] == gold_morphs
# Also test the results are still the same after IO

@@ -7,24 +7,28 @@ from spacy.pipeline.simple_ner import SimpleNER
import spacy
@pytest.fixture(params=[
@pytest.fixture(
params=[
["PER", "ORG", "LOC", "MISC"],
["GPE", "PERSON", "NUMBER", "CURRENCY", "EVENT"]
])
["GPE", "PERSON", "NUMBER", "CURRENCY", "EVENT"],
]
)
def labels(request):
return request.param
@pytest.fixture
def ops():
return NumpyOps()
def _get_actions(labels):
action_names = (
[f"B{label}" for label in labels] + \
[f"I{label}" for label in labels] + \
[f"L{label}" for label in labels] + \
[f"U{label}" for label in labels] + \
["O"]
[f"B{label}" for label in labels]
+ [f"I{label}" for label in labels]
+ [f"L{label}" for label in labels]
+ [f"U{label}" for label in labels]
+ ["O"]
)
A = namedtuple("actions", action_names)
return A(**{name: i for i, name in enumerate(action_names)})

@@ -270,7 +270,12 @@ def test_issue1963(en_tokenizer):
@pytest.mark.parametrize("label", ["U-JOB-NAME"])
def test_issue1967(label):
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner = EntityRecognizer(Vocab(), default_ner(), **config)
example = Example.from_dict(
Doc(ner.vocab, words=["word"]),

@@ -280,8 +285,8 @@ def test_issue1967(label):
"tags": ["tag"],
"heads": [0],
"deps": ["dep"],
"entities": [label]
}
"entities": [label],
},
)
assert "JOB-NAME" in ner.moves.get_actions(gold_parses=[example])[1]

@@ -196,7 +196,12 @@ def test_issue3345():
doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
doc[4].is_sent_start = True
ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}])
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner = EntityRecognizer(doc.vocab, default_ner(), **config)
# Add the OUT action. I wouldn't have thought this would be necessary...
ner.moves.add_action(5, "")

@@ -6,7 +6,12 @@ from spacy.pipeline.defaults import default_parser
def test_issue3830_no_subtok():
"""Test that the parser doesn't have subtok label if not learn_tokens"""
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(Vocab(), default_parser(), **config)
parser.add_label("nsubj")
assert "subtok" not in parser.labels

@@ -16,7 +21,12 @@ def test_issue3830_no_subtok():
def test_issue3830_with_subtok():
"""Test that the parser does have subtok label if learn_tokens=True."""
config = {"learn_tokens": True, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": True,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(Vocab(), default_parser(), **config)
parser.add_label("nsubj")
assert "subtok" not in parser.labels

@@ -74,7 +74,12 @@ def test_issue4042_bug2():
output_dir.mkdir()
ner1.to_disk(output_dir)
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner2 = EntityRecognizer(vocab, default_ner(), **config)
ner2.from_disk(output_dir)
assert len(ner2.labels) == 2

@@ -16,7 +16,12 @@ def test_issue4313():
beam_width = 16
beam_density = 0.0001
nlp = English()
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
ner.add_label("SOME_LABEL")
ner.begin_training([])

@@ -1,4 +1,5 @@
import pytest
# TODO
# from spacy.gold.converters.conllu2docs import conllu2docs

@@ -12,7 +12,12 @@ test_parsers = [DependencyParser, EntityRecognizer]
@pytest.fixture
def parser(en_vocab):
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(en_vocab, default_parser(), **config)
parser.add_label("nsubj")
return parser

@@ -36,7 +36,9 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
new_vocab1 = Vocab().from_bytes(vocab1_b)
assert new_vocab1.to_bytes() == vocab1_b
assert len(new_vocab1.strings) == len(strings1) + 2 # adds _SP and POS=SPACE
assert sorted([s for s in new_vocab1.strings]) == sorted(strings1 + list(default_strings))
assert sorted([s for s in new_vocab1.strings]) == sorted(
strings1 + list(default_strings)
)
@pytest.mark.parametrize("strings1,strings2", test_strings)

@@ -3,6 +3,7 @@ import pytest
from spacy.lang.en import English
from spacy.gold.converters import iob2docs, conll_ner2docs
from spacy.cli.pretrain import make_docs
# TODO
# from spacy.gold.converters import conllu2docs

@@ -155,7 +155,18 @@ def test_gold_biluo_misalign(en_vocab):
def test_split_sentences(en_vocab):
words = ["I", "flew", "to", "San Francisco Valley", "had", "loads of fun"]
doc = Doc(en_vocab, words=words)
gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "had", "loads", "of", "fun"]
gold_words = [
"I",
"flew",
"to",
"San",
"Francisco",
"Valley",
"had",
"loads",
"of",
"fun",
]
sent_starts = [True, False, False, False, False, False, True, False, False, False]
example = Example.from_dict(doc, {"words": gold_words, "sent_starts": sent_starts})
assert example.text == "I flew to San Francisco Valley had loads of fun "

@@ -166,7 +177,16 @@ def test_split_sentences(en_vocab):
words = ["I", "flew", "to", "San", "Francisco", "Valley", "had", "loads", "of fun"]
doc = Doc(en_vocab, words=words)
gold_words = ["I", "flew", "to", "San Francisco", "Valley", "had", "loads of", "fun"]
gold_words = [
"I",
"flew",
"to",
"San Francisco",
"Valley",
"had",
"loads of",
"fun",
]
sent_starts = [True, False, False, False, False, True, False, False]
example = Example.from_dict(doc, {"words": gold_words, "sent_starts": sent_starts})
assert example.text == "I flew to San Francisco Valley had loads of fun "

@@ -195,7 +215,15 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
gold_words = ["I", "flew to", "San Francisco Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
assert example.get_aligned("ENT_IOB") == [2, 2, 2, 3, 1, 1, 2]
assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "LOC", "LOC", "LOC", ""]
assert example.get_aligned("ENT_TYPE", as_string=True) == [
"",
"",
"",
"LOC",
"LOC",
"LOC",
"",
]
# misaligned
words = ["I flew", "to", "San Francisco", "Valley", "."]

@@ -206,11 +234,21 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
entities = [(offset_start, offset_end, "LOC")]
links = {(offset_start, offset_end): {"Q816843": 1.0}}
gold_words = ["I", "flew to", "San", "Francisco Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities, "links": links})
example = Example.from_dict(
doc, {"words": gold_words, "entities": entities, "links": links}
)
assert example.get_aligned("ENT_IOB") == [2, 2, 3, 1, 2]
assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "LOC", "LOC", ""]
assert example.get_aligned("ENT_KB_ID", as_string=True) == ["", "", "Q816843", "Q816843", ""]
assert example.to_dict()["doc_annotation"]["links"][(offset_start, offset_end)] == {"Q816843": 1.0}
assert example.get_aligned("ENT_KB_ID", as_string=True) == [
"",
"",
"Q816843",
"Q816843",
"",
]
assert example.to_dict()["doc_annotation"]["links"][(offset_start, offset_end)] == {
"Q816843": 1.0
}
# additional whitespace tokens in GoldParse words
words, spaces = get_words_and_spaces(

@@ -221,26 +259,55 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
gold_words = ["I", "flew", " ", "to", "San Francisco Valley", "."]
gold_spaces = [True, True, False, True, False, False]
example = Example.from_dict(doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities})
example = Example.from_dict(
doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities}
)
assert example.get_aligned("ENT_IOB") == [2, 2, 2, 2, 3, 1, 2]
assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "", "LOC", "LOC", ""]
assert example.get_aligned("ENT_TYPE", as_string=True) == [
"",
"",
"",
"",
"LOC",
"LOC",
"",
]
# from issue #4791
doc = en_tokenizer("I'll return the ₹54 amount")
gold_words = ["I", "'ll", "return", "the", "₹", "54", "amount"]
gold_spaces = [False, True, True, True, False, True, False]
entities = [(16, 19, "MONEY")]
example = Example.from_dict(doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities})
example = Example.from_dict(
doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities}
)
assert example.get_aligned("ENT_IOB") == [2, 2, 2, 2, 3, 2]
assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "", "MONEY", ""]
assert example.get_aligned("ENT_TYPE", as_string=True) == [
"",
"",
"",
"",
"MONEY",
"",
]
doc = en_tokenizer("I'll return the $54 amount")
gold_words = ["I", "'ll", "return", "the", "$", "54", "amount"]
gold_spaces = [False, True, True, True, False, True, False]
entities = [(16, 19, "MONEY")]
example = Example.from_dict(doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities})
example = Example.from_dict(
doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities}
)
assert example.get_aligned("ENT_IOB") == [2, 2, 2, 2, 3, 1, 2]
assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "", "MONEY", "MONEY", ""]
assert example.get_aligned("ENT_TYPE", as_string=True) == [
"",
"",
"",
"",
"MONEY",
"MONEY",
"",
]
def test_roundtrip_offsets_biluo_conversion(en_tokenizer):

@@ -311,7 +378,9 @@ def test_roundtrip_docs_to_json(doc):
assert lemmas == [t.lemma_ for t in reloaded_example.reference]
assert deps == [t.dep_ for t in reloaded_example.reference]
assert heads == [t.head.i for t in reloaded_example.reference]
assert ents == [(e.start_char, e.end_char, e.label_) for e in reloaded_example.reference.ents]
assert ents == [
(e.start_char, e.end_char, e.label_) for e in reloaded_example.reference.ents
]
assert "TRAVEL" in reloaded_example.reference.cats
assert "BAKING" in reloaded_example.reference.cats
assert cats["TRAVEL"] == reloaded_example.reference.cats["TRAVEL"]

@@ -375,7 +444,9 @@ def test_ignore_misaligned(doc):
# doesn't raise an AlignmentError, but there is nothing to iterate over
# because the only example can't be aligned
train_reloaded_example = list(goldcorpus.train_dataset(nlp, ignore_misaligned=True))
train_reloaded_example = list(
goldcorpus.train_dataset(nlp, ignore_misaligned=True)
)
assert len(train_reloaded_example) == 0

@@ -389,7 +460,9 @@ def test_make_orth_variants(doc):
# due to randomness, test only that this runs with no errors for now
train_example = next(goldcorpus.train_dataset(nlp))
variant_example = make_orth_variants_example(nlp, train_example, orth_variant_level=0.2)
variant_example = make_orth_variants_example(
nlp, train_example, orth_variant_level=0.2
)
@pytest.mark.parametrize(

@@ -430,7 +503,9 @@ def test_goldparse_startswith_space(en_tokenizer):
entities = ["U-DATE"]
deps = ["ROOT"]
heads = [0]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities, "deps":deps, "heads": heads})
example = Example.from_dict(
doc, {"words": gold_words, "entities": entities, "deps": deps, "heads": heads}
)
assert example.get_aligned("ENT_IOB") == [None, 3]
assert example.get_aligned("ENT_TYPE", as_string=True) == [None, "DATE"]
assert example.get_aligned("DEP", as_string=True) == [None, "ROOT"]

@@ -441,7 +516,12 @@ def test_gold_constructor():
nlp = English()
doc = nlp("This is a sentence")
example = Example.from_dict(doc, {"cats": {"cat1": 1.0, "cat2": 0.0}})
assert example.get_aligned("ORTH", as_string=True) == ["This", "is", "a", "sentence"]
assert example.get_aligned("ORTH", as_string=True) == [
"This",
"is",
"a",
"sentence",
]
assert example.reference.cats["cat1"]
assert not example.reference.cats["cat2"]

@@ -496,7 +576,7 @@ def test_split_sents(merged_dict):
nlp = English()
example = Example.from_dict(
Doc(nlp.vocab, words=merged_dict["words"], spaces=merged_dict["spaces"]),
merged_dict
merged_dict,
)
assert example.text == "Hi there everyone It is just me"

@@ -522,10 +602,7 @@ def test_tuples_to_example(vocab, merged_dict):
cats = {"TRAVEL": 1.0, "BAKING": 0.0}
merged_dict = dict(merged_dict)
merged_dict["cats"] = cats
ex = Example.from_dict(
Doc(vocab, words=merged_dict["words"]),
merged_dict
)
ex = Example.from_dict(Doc(vocab, words=merged_dict["words"]), merged_dict)
words = [token.text for token in ex.reference]
assert words == merged_dict["words"]
tags = [token.tag_ for token in ex.reference]

@@ -36,9 +36,7 @@ def test_language_update(nlp):
def test_language_evaluate(nlp):
text = "hello world"
annots = {
"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}
}
annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}}
doc = Doc(nlp.vocab, words=text.split(" "))
# Evaluate with text and dict
nlp.evaluate([(text, annots)])

@@ -32,7 +32,9 @@ def test_Example_from_dict_invalid(annots):
Example.from_dict(predicted, annots)
@pytest.mark.parametrize("pred_words", [["ice", "cream"], ["icecream"], ["i", "ce", "cream"]])
@pytest.mark.parametrize(
"pred_words", [["ice", "cream"], ["icecream"], ["i", "ce", "cream"]]
)
@pytest.mark.parametrize("annots", [{"words": ["icecream"], "tags": ["NN"]}])
def test_Example_from_dict_with_tags(pred_words, annots):
vocab = Vocab()

@@ -161,7 +163,15 @@ def test_Example_from_dict_with_entities(annots):
example = Example.from_dict(predicted, annots)
assert len(list(example.reference.ents)) == 2
assert [example.reference[i].ent_iob_ for i in range(7)] == ["O", "O", "B", "I", "O", "B", "O"]
assert [example.reference[i].ent_iob_ for i in range(7)] == [
"O",
"O",
"B",
"I",
"O",
"B",
"O",
]
assert example.get_aligned("ENT_IOB") == [2, 2, 3, 1, 2, 3, 2]
assert example.reference[2].ent_type_ == "LOC"

@@ -174,7 +184,10 @@ def test_Example_from_dict_with_entities(annots):
[
{
"words": ["I", "like", "New", "York", "and", "Berlin", "."],
"entities": [(0, 4, "LOC"), (21, 27, "LOC")], # not aligned to token boundaries
"entities": [
(0, 4, "LOC"),
(21, 27, "LOC"),
], # not aligned to token boundaries
}
],
)

@@ -192,7 +205,10 @@ def test_Example_from_dict_with_entities_invalid(annots):
{
"words": ["I", "like", "New", "York", "and", "Berlin", "."],
"entities": [(7, 15, "LOC"), (20, 26, "LOC")],
"links": {(7, 15): {"Q60": 1.0, "Q64": 0.0}, (20, 26): {"Q60": 0.0, "Q64": 1.0}},
"links": {
(7, 15): {"Q60": 1.0, "Q64": 0.0},
(20, 26): {"Q60": 0.0, "Q64": 1.0},
},
}
],
)

@@ -224,4 +240,3 @@ def test_Example_from_dict_with_links_invalid(annots):
predicted = Doc(vocab, words=annots["words"])
with pytest.raises(ValueError):
Example.from_dict(predicted, annots)

@@ -42,6 +42,7 @@ test_ner_apple = [
]
]
@pytest.fixture
def tagged_doc():
text = "Sarah's sister flew to Silicon Valley via London."

@@ -26,7 +26,9 @@ def test_util_minibatch(doc_sizes, expected_batches):
docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
tol = 0.2
batch_size = 1000
batches = list(minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=True))
batches = list(
minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=True)
)
assert [len(batch) for batch in batches] == expected_batches
max_size = batch_size + batch_size * tol

@@ -50,7 +52,7 @@ def test_util_minibatch_oversize(doc_sizes, expected_batches):
docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
tol = 0.2
batch_size = 1000
batches = list(minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=False))
batches = list(
minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=False)
)
assert [len(batch) for batch in batches] == expected_batches

@@ -27,7 +27,15 @@ def make_tempdir():
def get_doc(
vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None, morphs=None
vocab,
words=[],
pos=None,
heads=None,
deps=None,
tags=None,
ents=None,
lemmas=None,
morphs=None,
):
"""Create Doc object from given vocab, words and annotations."""
if deps and not heads:

@@ -9,16 +9,7 @@ from ..attrs import SPACY, ORTH, intify_attr
from ..errors import Errors
ALL_ATTRS = (
"ORTH",
"TAG",
"HEAD",
"DEP",
"ENT_IOB",
"ENT_TYPE",
"LEMMA",
"MORPH"
)
ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "LEMMA", "MORPH")
class DocBin(object):
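
The commit does not name the tool that produced these changes. Assuming an autoformatter in the black style was used (an assumption, not stated anywhere above), the result can be re-verified without modifying any files:

    # hypothetical check, assuming black is installed for this checkout
    python -m black --check spacy

black exits non-zero and lists any file that would still be reformatted.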