mirror of https://github.com/explosion/spaCy.git
synced 2025-03-21 10:24:26 +03:00

Merge branch 'develop' into nightly.spacy.io

This commit is contained in commit 129e832306.
Makefile

@@ -22,13 +22,11 @@ override WHEELHOUSE = "./wheelhouse"
 endif

 dist/$(SPACY_BIN) : $(WHEELHOUSE)/spacy-$(PYVER)-$(version).stamp
 	$(VENV)/bin/pex \
 		-f $(WHEELHOUSE) \
 		--no-index \
 		--disable-cache \
-		-m spacy \
 		-o $@ \
 		$(package)==$(version) \
 		$(SPACY_EXTRAS)

@@ -1,133 +0,0 @@ (entire file removed)
[paths]
train = ""
dev = ""
raw = null
init_tok2vec = null

[system]
seed = 0
use_pytorch_for_gpu_memory = false

[training]
seed = ${system:seed}
dropout = 0.1
init_tok2vec = ${paths:init_tok2vec}
vectors = null
accumulate_gradient = 1
max_steps = 0
max_epochs = 0
patience = 10000
eval_frequency = 200
score_weights = {"dep_las": 0.4, "ents_f": 0.4, "tag_acc": 0.2}
frozen_components = []

[training.train_corpus]
@readers = "spacy.Corpus.v1"
path = ${paths:train}
gold_preproc = true
max_length = 0
limit = 0

[training.dev_corpus]
@readers = "spacy.Corpus.v1"
path = ${paths:dev}
gold_preproc = ${training.read_train:gold_preproc}
max_length = 0
limit = 0

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2

[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 1e-8
learn_rate = 0.001

[nlp]
lang = "en"
load_vocab_data = false
pipeline = ["tok2vec", "ner", "tagger", "parser"]

[nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[components]

[components.tok2vec]
factory = "tok2vec"

[components.ner]
factory = "ner"
learn_tokens = false
min_action_freq = 1

[components.tagger]
factory = "tagger"

[components.parser]
factory = "parser"
learn_tokens = false
min_action_freq = 30

[components.tagger.model]
@architectures = "spacy.Tagger.v1"

[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}

[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 8
hidden_width = 128
maxout_pieces = 2
use_upper = true

[components.parser.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 3
hidden_width = 128
maxout_pieces = 2
use_upper = true

[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v1"

[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = ${components.tok2vec.model.encode:width}
rows = 2000
also_embed_subwords = true
also_use_static_vectors = false

[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3

@@ -1,152 +0,0 @@ (entire file removed)
# Training hyper-parameters and additional features.
[training]
# Whether to train on sequences with 'gold standard' sentence boundaries
# and tokens. If you set this to true, take care to ensure your run-time
# data is passed in sentence-by-sentence via some prior preprocessing.
gold_preproc = false
# Limitations on training document length or number of examples.
max_length = 0
limit = 0
# Data augmentation
orth_variant_level = 0.0
dropout = 0.1
# Controls early-stopping. 0 or -1 mean unlimited.
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 400
# Other settings
seed = 0
accumulate_gradient = 1
use_pytorch_for_gpu_memory = false
# Control how scores are printed and checkpoints are evaluated.
scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
# These settings are invalid for the transformer models.
init_tok2vec = null
discard_oversize = false
omit_extra_lookups = false
batch_by = "words"
use_gpu = -1
raw_text = null
tag_map = null

[training.batch_size]
@schedules = "compounding.v1"
start = 1000
stop = 1000
compound = 1.001

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 1e-8
learn_rate = 0.001

[pretraining]
max_epochs = 1000
min_length = 5
max_length = 500
dropout = 0.2
n_save_every = null
batch_size = 3000
seed = ${training:seed}
use_pytorch_for_gpu_memory = ${training:use_pytorch_for_gpu_memory}
tok2vec_model = "nlp.pipeline.tok2vec.model"

[pretraining.objective]
type = "characters"
n_characters = 4

[pretraining.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 1e-8
learn_rate = 0.001

[nlp]
lang = "en"
vectors = null
base_model = null

[nlp.pipeline]

[nlp.pipeline.tok2vec]
factory = "tok2vec"

[nlp.pipeline.senter]
factory = "senter"

[nlp.pipeline.ner]
factory = "ner"
learn_tokens = false
min_action_freq = 1
beam_width = 1
beam_update_prob = 1.0

[nlp.pipeline.tagger]
factory = "tagger"

[nlp.pipeline.parser]
factory = "parser"
learn_tokens = false
min_action_freq = 1
beam_width = 1
beam_update_prob = 1.0

[nlp.pipeline.senter.model]
@architectures = "spacy.Tagger.v1"

[nlp.pipeline.senter.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}

[nlp.pipeline.tagger.model]
@architectures = "spacy.Tagger.v1"

[nlp.pipeline.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}

[nlp.pipeline.parser.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 8
hidden_width = 128
maxout_pieces = 3
use_upper = false

[nlp.pipeline.parser.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}

[nlp.pipeline.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 3
hidden_width = 128
maxout_pieces = 3
use_upper = false

[nlp.pipeline.ner.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}

[nlp.pipeline.tok2vec.model]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = ${nlp:vectors}
width = 256
depth = 6
window_size = 1
embed_size = 10000
maxout_pieces = 3
subword_features = true
dropout = null

@@ -1,73 +0,0 @@ (entire file removed)
# Training hyper-parameters and additional features.
[training]
# Whether to train on sequences with 'gold standard' sentence boundaries
# and tokens. If you set this to true, take care to ensure your run-time
# data is passed in sentence-by-sentence via some prior preprocessing.
gold_preproc = false
# Limitations on training document length or number of examples.
max_length = 3000
limit = 0
# Data augmentation
orth_variant_level = 0.0
dropout = 0.1
# Controls early-stopping. 0 or -1 mean unlimited.
patience = 100000
max_epochs = 0
max_steps = 0
eval_frequency = 1000
# Other settings
seed = 0
accumulate_gradient = 1
use_pytorch_for_gpu_memory = false
# Control how scores are printed and checkpoints are evaluated.
scores = ["speed", "ents_p", "ents_r", "ents_f"]
score_weights = {"ents_f": 1.0}
# These settings are invalid for the transformer models.
init_tok2vec = null
discard_oversize = false
omit_extra_lookups = false
batch_by = "words"

[training.batch_size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 1e-8
learn_rate = 0.001

[nlp]
lang = "en"
vectors = null

[nlp.pipeline.ner]
factory = "ner"
learn_tokens = false
min_action_freq = 1

[nlp.pipeline.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 3
hidden_width = 64
maxout_pieces = 2
use_upper = true

[nlp.pipeline.ner.model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = ${nlp:vectors}
width = 96
depth = 4
window_size = 1
embed_size = 2000
maxout_pieces = 3
subword_features = true
dropout = ${training:dropout}

@@ -1,73 +0,0 @@ (entire file removed)
[training]
patience = 10000
eval_frequency = 200
dropout = 0.2
init_tok2vec = null
vectors = null
max_epochs = 100
orth_variant_level = 0.0
gold_preproc = true
max_length = 0
use_gpu = 0
scores = ["tags_acc", "uas", "las"]
score_weights = {"las": 0.8, "tags_acc": 0.2}
limit = 0
seed = 0
accumulate_gradient = 2
discard_oversize = false

[training.batch_size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001

[training.optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
beta1 = 0.9
beta2 = 0.999

[nlp]
lang = "en"
vectors = ${training:vectors}

[nlp.pipeline.tok2vec]
factory = "tok2vec"

[nlp.pipeline.tagger]
factory = "tagger"

[nlp.pipeline.parser]
factory = "parser"
learn_tokens = false
min_action_freq = 1
beam_width = 1
beam_update_prob = 1.0

[nlp.pipeline.tagger.model]
@architectures = "spacy.Tagger.v1"

[nlp.pipeline.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}

[nlp.pipeline.parser.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 8
hidden_width = 64
maxout_pieces = 3

[nlp.pipeline.parser.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}

[nlp.pipeline.tok2vec.model]
@architectures = "spacy.HashEmbedBiLSTM.v1"
pretrained_vectors = ${nlp:vectors}
width = 96
depth = 4
embed_size = 2000
subword_features = true
maxout_pieces = 3
dropout = null

@@ -1,110 +0,0 @@ (entire file removed)
[paths]
train = ""
dev = ""
raw = null
init_tok2vec = null

[system]
seed = 0
use_pytorch_for_gpu_memory = false

[training]
seed = ${system:seed}
dropout = 0.2
init_tok2vec = ${paths:init_tok2vec}
vectors = null
accumulate_gradient = 1
max_steps = 0
max_epochs = 0
patience = 10000
eval_frequency = 200
score_weights = {"dep_las": 0.8, "tag_acc": 0.2}

[training.read_train]
@readers = "spacy.Corpus.v1"
path = ${paths:train}
gold_preproc = true
max_length = 0
limit = 0

[training.read_dev]
@readers = "spacy.Corpus.v1"
path = ${paths:dev}
gold_preproc = ${training.read_train:gold_preproc}
max_length = 0
limit = 0

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2

[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001

[training.optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
beta1 = 0.9
beta2 = 0.999

[nlp]
lang = "en"
pipeline = ["tok2vec", "tagger", "parser"]
load_vocab_data = false

[nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[components]

[components.tok2vec]
factory = "tok2vec"

[components.tagger]
factory = "tagger"

[components.parser]
factory = "parser"
learn_tokens = false
min_action_freq = 1

[components.tagger.model]
@architectures = "spacy.Tagger.v1"

[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}

[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 8
hidden_width = 64
maxout_pieces = 3

[components.parser.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v1"

[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = ${components.tok2vec.model.encode:width}
rows = 2000
also_embed_subwords = true
also_use_static_vectors = false

[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3

@@ -1,69 +0,0 @@ (entire file removed)
[training]
use_gpu = -1
limit = 0
dropout = 0.2
patience = 10000
eval_frequency = 200
scores = ["ents_f"]
score_weights = {"ents_f": 1}
orth_variant_level = 0.0
gold_preproc = true
max_length = 0
batch_size = 25
seed = 0
accumulate_gradient = 2
discard_oversize = false

[training.optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
beta1 = 0.9
beta2 = 0.999

[nlp]
lang = "en"
vectors = null

[nlp.pipeline.tok2vec]
factory = "tok2vec"

[nlp.pipeline.tok2vec.model]
@architectures = "spacy.Tok2Vec.v1"

[nlp.pipeline.tok2vec.model.extract]
@architectures = "spacy.CharacterEmbed.v1"
width = 96
nM = 64
nC = 8
rows = 2000
columns = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
dropout = null

[nlp.pipeline.tok2vec.model.extract.features]
@architectures = "spacy.Doc2Feats.v1"
columns = ${nlp.pipeline.tok2vec.model.extract:columns}

[nlp.pipeline.tok2vec.model.embed]
@architectures = "spacy.LayerNormalizedMaxout.v1"
width = ${nlp.pipeline.tok2vec.model.extract:width}
maxout_pieces = 4

[nlp.pipeline.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = ${nlp.pipeline.tok2vec.model.extract:width}
window_size = 1
maxout_pieces = 2
depth = 2

[nlp.pipeline.ner]
factory = "ner"

[nlp.pipeline.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 6
hidden_width = 64
maxout_pieces = 2

[nlp.pipeline.ner.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model.extract:width}

@@ -1,51 +0,0 @@ (entire file removed)
[training]
use_gpu = -1
limit = 0
dropout = 0.2
patience = 10000
eval_frequency = 200
scores = ["ents_p", "ents_r", "ents_f"]
score_weights = {"ents_f": 1}
orth_variant_level = 0.0
gold_preproc = true
max_length = 0
seed = 0
accumulate_gradient = 2
discard_oversize = false

[training.batch_size]
@schedules = "compounding.v1"
start = 3000
stop = 3000
compound = 1.001

[training.optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
beta1 = 0.9
beta2 = 0.999

[nlp]
lang = "en"
vectors = null

[nlp.pipeline.ner]
factory = "ner"

[nlp.pipeline.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 6
hidden_width = 64
maxout_pieces = 2

[nlp.pipeline.ner.model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
width = 128
depth = 4
embed_size = 7000
maxout_pieces = 3
window_size = 1
subword_features = true
pretrained_vectors = null
dropout = null

@@ -18,6 +18,7 @@ from .util import registry, logger  # noqa: F401
 from .errors import Errors
 from .language import Language
+from .vocab import Vocab
 from . import util

@@ -46,12 +47,22 @@ def load(
     return util.load_model(name, disable=disable, exclude=exclude, config=config)


-def blank(name: str, **overrides) -> Language:
+def blank(
+    name: str,
+    *,
+    vocab: Union[Vocab, bool] = True,
+    config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
+    meta: Dict[str, Any] = util.SimpleFrozenDict()
+) -> Language:
     """Create a blank nlp object for a given language code.

     name (str): The language code, e.g. "en".
-    **overrides: Keyword arguments passed to language subclass on init.
+    vocab (Vocab): A Vocab object. If True, a vocab is created.
+    config (Dict[str, Any] / Config): Optional config overrides.
+    meta (Dict[str, Any]): Overrides for nlp.meta.
     RETURNS (Language): The nlp object.
     """
     LangClass = util.get_lang_class(name)
-    return LangClass(**overrides)
+    # We should accept both dot notation and nested dict here for consistency
+    config = util.dot_to_dict(config)
+    return LangClass.from_config(config, meta=meta)

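Usage sketch (editor's note, not part of the commit): a minimal call against the new keyword-only blank() signature shown above, assuming a spaCy build that includes this change; the meta values are illustrative.

    import spacy

    # blank() is now keyword-only for vocab/config/meta and builds the pipeline
    # via Language.from_config() instead of calling the subclass directly.
    nlp = spacy.blank("en", meta={"description": "blank English pipeline"})
    print(nlp.lang, nlp.pipe_names)
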
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a18"
+__version__ = "3.0.0a19"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

@@ -30,6 +30,7 @@ def init_config_cli(
     pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
     optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
     cpu: bool = Opt(False, "--cpu", "-C", help="Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
+    pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
     # fmt: on
 ):
     """
@@ -43,7 +44,14 @@ def init_config_cli(
     if isinstance(optimize, Optimizations):  # instance of enum from the CLI
         optimize = optimize.value
     pipeline = string_to_list(pipeline)
-    init_config(output_file, lang=lang, pipeline=pipeline, optimize=optimize, cpu=cpu)
+    init_config(
+        output_file,
+        lang=lang,
+        pipeline=pipeline,
+        optimize=optimize,
+        cpu=cpu,
+        pretraining=pretraining,
+    )


 @init_cli.command("fill-config")
@@ -51,7 +59,7 @@ def init_fill_config_cli(
     # fmt: off
     base_path: Path = Arg(..., help="Base config to fill", exists=True, dir_okay=False),
     output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True),
-    pretraining: bool = Opt(False, "--pretraining", "-p", help="Include config for pretraining (with 'spacy pretrain')"),
+    pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
     diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes")
     # fmt: on
 ):
@@ -109,7 +117,13 @@ def fill_config(


 def init_config(
-    output_file: Path, *, lang: str, pipeline: List[str], optimize: str, cpu: bool
+    output_file: Path,
+    *,
+    lang: str,
+    pipeline: List[str],
+    optimize: str,
+    cpu: bool,
+    pretraining: bool = False,
 ) -> None:
     is_stdout = str(output_file) == "-"
     msg = Printer(no_print=is_stdout)
@@ -156,8 +170,13 @@ def init_config(
     with show_validation_error(hint_fill=False):
         config = util.load_config_from_str(base_template)
         nlp, _ = util.load_model_from_config(config, auto_fill=True)
+        config = nlp.config
+        if pretraining:
+            validate_config_for_pretrain(config, msg)
+            pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
+            config = pretrain_config.merge(config)
     msg.good("Auto-filled config with all values")
-    save_config(nlp.config, output_file, is_stdout=is_stdout)
+    save_config(config, output_file, is_stdout=is_stdout)


 def save_config(

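Usage sketch (editor's note, not part of the commit): calling the updated init_config() signature from Python to generate a config that also includes the [pretraining] block; the output path and pipeline choice are illustrative assumptions.

    from pathlib import Path
    from spacy.cli.init_config import init_config

    # Generate a base config and merge in the default pretraining settings
    init_config(
        Path("config.cfg"),      # hypothetical output path
        lang="en",
        pipeline=["ner"],
        optimize="efficiency",
        cpu=True,
        pretraining=True,
    )
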
@@ -1,13 +1,23 @@
 [pretraining]
 max_epochs = 1000
-min_length = 5
-max_length = 500
 dropout = 0.2
 n_save_every = null
-batch_size = 3000
-seed = ${system.seed}
-use_pytorch_for_gpu_memory = ${system.use_pytorch_for_gpu_memory}
-tok2vec_model = "components.tok2vec.model"
+component = "tok2vec"
+layer = ""
+
+[pretraining.batcher]
+@batchers = "spacy.batch_by_words.v1"
+size = 3000
+discard_oversize = false
+tolerance = 0.2
+get_length = null
+
+[pretraining.corpus]
+@readers = "spacy.JsonlReader.v1"
+path = ${paths.raw}
+min_length = 5
+max_length = 500
+limit = 0

 [pretraining.objective]
 type = "characters"

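Usage sketch (editor's note, not part of the commit): inspecting the restructured default pretraining config from Python. The import location of DEFAULT_CONFIG_PRETRAIN_PATH is assumed from its use in init_config.py above.

    from spacy import util
    from spacy.cli.init_config import DEFAULT_CONFIG_PRETRAIN_PATH  # assumed import location

    pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
    # Pretraining now targets a named component/layer and uses a registered
    # batcher and corpus reader instead of flat batch_size/min_length settings.
    print(pretrain_config["pretraining"]["component"])            # "tok2vec"
    print(pretrain_config["pretraining"]["corpus"]["@readers"])   # "spacy.JsonlReader.v1"
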
@@ -121,7 +121,7 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
     RETURNS (dict): Generated dependency parse keyed by words and arcs.
     """
     doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data"]))
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
         warnings.warn(Warnings.W005)
     if options.get("collapse_phrases", False):
         with doc.retokenize() as retokenizer:

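Usage sketch (editor's note, not part of the commit): the deprecated doc.is_parsed flag is replaced by Doc.has_annotation("DEP") throughout this commit; a minimal check, assuming a spaCy build with this change.

    import spacy

    nlp = spacy.blank("en")           # no dependency parser in the pipeline
    doc = nlp("This is a sentence.")
    # displaCy warns (W005) when this is False instead of checking doc.is_parsed
    print(doc.has_annotation("DEP"))  # False
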
@@ -119,6 +119,11 @@ class Warnings:
     W105 = ("As of spaCy v3.0, the {matcher}.pipe method is deprecated. If you "
             "need to match on a stream of documents, you can use nlp.pipe and "
             "call the {matcher} on each Doc object.")
+    W106 = ("Both HEAD and SENT_START are included as attributes in "
+            "doc.from_array(). The parse trees based on the HEAD attribute "
+            "will override the values in SENT_START.")
+    W107 = ("The property Doc.{prop} is deprecated. Use "
+            "Doc.has_annotation(\"{attr}\") instead.")


 @add_codes
@@ -192,11 +197,6 @@ class Errors:
             "Alternatively, add the dependency parser, or set sentence "
             "boundaries by setting doc[i].is_sent_start.")
     E031 = ("Invalid token: empty string ('') at position {i}.")
-    E032 = ("Conflicting attributes specified in doc.from_array(): "
-            "(HEAD, SENT_START). The HEAD attribute currently sets sentence "
-            "boundaries implicitly, based on the tree structure. This means "
-            "the HEAD attribute would potentially override the sentence "
-            "boundaries set by SENT_START.")
     E033 = ("Cannot load into non-empty Doc of length {length}.")
     E035 = ("Error creating span with start {start} and end {end} for Doc of "
             "length {length}.")
@@ -397,8 +397,8 @@ class Errors:
     E154 = ("One of the attributes or values is not supported for token "
             "patterns. Please use the option validate=True with Matcher, "
             "PhraseMatcher, or EntityRuler for more details.")
-    E155 = ("The pipeline needs to include a tagger in order to use "
-            "Matcher or PhraseMatcher with the attributes POS, TAG, or LEMMA. "
+    E155 = ("The pipeline needs to include a {pipe} in order to use "
+            "Matcher or PhraseMatcher with the attribute {attr}. "
             "Try using nlp() instead of nlp.make_doc() or list(nlp.pipe()) "
             "instead of list(nlp.tokenizer.pipe()).")
     E156 = ("The pipeline needs to include a parser in order to use "
@@ -480,6 +480,9 @@ class Errors:
     E201 = ("Span index out of range.")

     # TODO: fix numbering after merging develop into master
+    E918 = ("Received invalid value for vocab: {vocab} ({vocab_type}). Valid "
+            "values are an instance of spacy.vocab.Vocab or True to create one"
+            " (default).")
     E919 = ("A textcat 'positive_label' '{pos_label}' was provided for training "
             "data that does not appear to be a binary classification problem "
             "with two labels. Labels found: {labels}")
@@ -552,7 +555,10 @@ class Errors:
             "to register a simple stateless function component that just takes "
             "a Doc and returns it.")
     E958 = ("Language code defined in config ({bad_lang_code}) does not match "
-            "language code of current Language subclass {lang} ({lang_code})")
+            "language code of current Language subclass {lang} ({lang_code}). "
+            "If you want to create an nlp object from a config, make sure to "
+            "use the matching subclass with the language-specific settings and "
+            "data.")
     E959 = ("Can't insert component {dir} index {idx}. Existing components: {opts}")
     E960 = ("No config data found for component '{name}'. This is likely a bug "
             "in spaCy.")

@@ -1,7 +1,11 @@
+from typing import Optional
+from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from ...language import Language
+from ...lookups import Lookups
+from ...pipeline import Lemmatizer


 class BengaliDefaults(Language.Defaults):
@@ -17,4 +21,22 @@ class Bengali(Language):
     Defaults = BengaliDefaults


+@Bengali.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["Bengali"]

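Usage sketch (editor's note, not part of the commit): adding the lemmatizer factory registered above (the same pattern is repeated for Persian, Norwegian and Swedish below); loading the default rule tables assumes the spacy-lookups-data package is installed.

    import spacy

    nlp = spacy.blank("bn")
    # Uses the default config registered above (mode="rule"); the rule tables
    # come from spacy-lookups-data.
    lemmatizer = nlp.add_pipe("lemmatizer")
    print(nlp.pipe_names)  # ["lemmatizer"]
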
@@ -16,7 +16,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
     labels = ["sb", "oa", "da", "nk", "mo", "ag", "ROOT", "root", "cj", "pd", "og", "app"]
     # fmt: on
     doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
         raise ValueError(Errors.E029)
     np_label = doc.vocab.strings.add("NP")
     np_deps = set(doc.vocab.strings.add(label) for label in labels)

@@ -13,7 +13,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
     # Further improvement of the models will eliminate the need for this tag.
     labels = ["nsubj", "obj", "iobj", "appos", "ROOT", "obl"]
     doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
         raise ValueError(Errors.E029)
     np_deps = [doc.vocab.strings.add(label) for label in labels]
     conj = doc.vocab.strings.add("conj")

@@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
     labels = ["nsubj", "dobj", "nsubjpass", "pcomp", "pobj", "dative", "appos", "attr", "ROOT"]
     # fmt: on
     doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
         raise ValueError(Errors.E029)
     np_deps = [doc.vocab.strings.add(label) for label in labels]
     conj = doc.vocab.strings.add("conj")

@@ -8,7 +8,7 @@ from ...tokens import Doc, Span, Token
 def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
     """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
     doc = doclike.doc
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
         raise ValueError(Errors.E029)
     if not len(doc):
         return

@@ -1,9 +1,13 @@
+from typing import Optional
+from thinc.api import Model
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_SUFFIXES
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
+from ...lookups import Lookups
+from ...pipeline import Lemmatizer


 class PersianDefaults(Language.Defaults):
@@ -20,4 +24,22 @@ class Persian(Language):
     Defaults = PersianDefaults


+@Persian.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["Persian"]

@@ -19,7 +19,7 @@ def noun_chunks(doclike):
     ]
     doc = doclike.doc  # Ensure works on both Doc and Span.

-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
         raise ValueError(Errors.E029)

     np_deps = [doc.vocab.strings.add(label) for label in labels]

@@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
     labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
     # fmt: on
     doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
         raise ValueError(Errors.E029)
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")

@@ -13,7 +13,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
     labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
     # fmt: on
     doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
         raise ValueError(Errors.E029)
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")

@@ -1,9 +1,13 @@
+from typing import Optional
+from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
+from ...lookups import Lookups
+from ...pipeline import Lemmatizer


 class NorwegianDefaults(Language.Defaults):
@@ -20,4 +24,22 @@ class Norwegian(Language):
     Defaults = NorwegianDefaults


+@Norwegian.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["Norwegian"]

@@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
     labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
     # fmt: on
     doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
         raise ValueError(Errors.E029)
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")

@@ -1,8 +1,13 @@
+from typing import Optional
+from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
+from ...lookups import Lookups
+from ...pipeline import Lemmatizer

 # Punctuation stolen from Danish
 from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
@@ -22,4 +27,22 @@ class Swedish(Language):
     Defaults = SwedishDefaults


+@Swedish.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["Swedish"]

@@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
     labels = ["nsubj", "nsubj:pass", "dobj", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
     # fmt: on
     doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
         raise ValueError(Errors.E029)
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")

@@ -8,7 +8,7 @@ from contextlib import contextmanager
 from copy import deepcopy
 from pathlib import Path
 import warnings
-from thinc.api import get_current_ops, Config, require_gpu, Optimizer
+from thinc.api import Model, get_current_ops, Config, require_gpu, Optimizer
 import srsly
 import multiprocessing as mp
 from itertools import chain, cycle
@@ -144,6 +144,8 @@ class Language:
         self._pipe_meta: Dict[str, "FactoryMeta"] = {}  # meta by component
         self._pipe_configs: Dict[str, Config] = {}  # config by component
+        if not isinstance(vocab, Vocab) and vocab is not True:
+            raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab)))
         if vocab is True:
             vectors_name = meta.get("vectors", {}).get("name")
             vocab = create_vocab(
@@ -396,8 +398,6 @@ class Language:
         if name not in self._pipe_configs:
             raise ValueError(Errors.E960.format(name=name))
         pipe_config = self._pipe_configs[name]
-        pipe_config.pop("nlp", None)
-        pipe_config.pop("name", None)
         return pipe_config

     @classmethod
@@ -650,6 +650,10 @@ class Language:
         filled = Config(filled[factory_name])
         filled["factory"] = factory_name
         filled.pop("@factories", None)
+        # Remove the extra values we added because we don't want to keep passing
+        # them around, copying them etc.
+        filled.pop("nlp", None)
+        filled.pop("name", None)
         # Merge the final filled config with the raw config (including non-
         # interpolated variables)
         if raw_config:
@@ -1444,10 +1448,15 @@ class Language:
         """Register 'listeners' within pipeline components, to allow them to
         effectively share weights.
         """
+        # I had though, "Why do we do this inside the Language object? Shouldn't
+        # it be the tok2vec/transformer/etc's job?
+        # The problem is we need to do it during deserialization...And the
+        # components don't receive the pipeline then. So this does have to be
+        # here :(
         for i, (name1, proc1) in enumerate(self.pipeline):
             if hasattr(proc1, "find_listeners"):
-                for name2, proc2 in self.pipeline[i:]:
-                    if hasattr(proc2, "model"):
+                for name2, proc2 in self.pipeline[i+1:]:
+                    if isinstance(getattr(proc2, "model", None), Model):
                         proc1.find_listeners(proc2.model)

     @classmethod
@@ -1458,6 +1467,7 @@ class Language:
         vocab: Union[Vocab, bool] = True,
         disable: Iterable[str] = SimpleFrozenList(),
         exclude: Iterable[str] = SimpleFrozenList(),
+        meta: Dict[str, Any] = SimpleFrozenDict(),
         auto_fill: bool = True,
         validate: bool = True,
     ) -> "Language":
@@ -1472,6 +1482,7 @@ class Language:
             explicitly enable them by calling nlp.enable_pipe.
         exclude (Iterable[str]): Names of pipeline components to exclude.
             Excluded components won't be loaded.
+        meta (Dict[str, Any]): Meta overrides for nlp.meta.
         auto_fill (bool): Automatically fill in missing values in config based
             on defaults and function argument annotations.
         validate (bool): Validate the component config and arguments against
@@ -1487,7 +1498,7 @@ class Language:
         if "nlp" not in config:
             raise ValueError(Errors.E985.format(config=config))
         config_lang = config["nlp"]["lang"]
-        if cls.lang is not None and config_lang is not None and config_lang != cls.lang:
+        if config_lang is not None and config_lang != cls.lang:
             raise ValueError(
                 Errors.E958.format(
                     bad_lang_code=config["nlp"]["lang"],
@@ -1525,7 +1536,7 @@ class Language:
         # inside stuff like the spacy train function. If we loaded them here,
         # then we would load them twice at runtime: once when we make from config,
        # and then again when we load from disk.
-        nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer)
+        nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer, meta=meta)
         if after_creation is not None:
             nlp = after_creation(nlp)
         if not isinstance(nlp, cls):

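Usage sketch (editor's note, not part of the commit): exercising two of the Language changes above, the new meta argument to from_config() and the E918 vocab validation; the config string is a minimal illustration, not a recommended setup.

    from spacy import util
    from spacy.lang.en import English

    config = util.load_config_from_str("""
    [nlp]
    lang = "en"
    pipeline = []
    """)
    # meta overrides can now be passed straight through from_config()
    nlp = English.from_config(config, meta={"description": "demo pipeline"})

    try:
        English(vocab="not-a-vocab")   # neither a Vocab instance nor True
    except ValueError as err:
        print(err)                     # E918
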
@@ -17,7 +17,7 @@ from ..vocab cimport Vocab
from ..tokens.doc cimport Doc, get_token_attr_for_matcher
from ..tokens.span cimport Span
from ..tokens.token cimport Token
-from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA
+from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH

from ..schemas import validate_token_pattern
from ..errors import Errors, MatchPatternError, Warnings

@@ -215,10 +215,15 @@ cdef class Matcher:
else:
raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__))
cdef Pool tmp_pool = Pool()
-if len(set([LEMMA, POS, TAG]) & self._seen_attrs) > 0 \
-and not doc.is_tagged:
-raise ValueError(Errors.E155.format())
-if DEP in self._seen_attrs and not doc.is_parsed:
+if TAG in self._seen_attrs and not doc.has_annotation("TAG"):
+raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG"))
+if POS in self._seen_attrs and not doc.has_annotation("POS"):
+raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS"))
+if MORPH in self._seen_attrs and not doc.has_annotation("MORPH"):
+raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH"))
+if LEMMA in self._seen_attrs and not doc.has_annotation("LEMMA"):
+raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA"))
+if DEP in self._seen_attrs and not doc.has_annotation("DEP"):
raise ValueError(Errors.E156.format())
matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
extensions=self._extensions, predicates=self._extra_predicates)
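For context on the Matcher change above: the single is_tagged/is_parsed check is split into per-attribute checks via Doc.has_annotation, and E155 now names the missing pipe and attribute. A small illustrative sketch (not part of this diff) of what that looks like from user code:

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")  # no tagger, so TAG is never set
matcher = Matcher(nlp.vocab)
matcher.add("VERB_PATTERN", [[{"TAG": "VBZ"}]])
doc = nlp("This is a test")
if doc.has_annotation("TAG"):
    matches = matcher(doc)
else:
    # Calling matcher(doc) here would raise E155 for pipe="tagger", attr="TAG"
    print("TAG annotation missing; run a tagger first or set token.tag_")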
@@ -4,7 +4,7 @@ from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter

import warnings

-from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA
+from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA, MORPH
from ..structs cimport TokenC
from ..tokens.token cimport Token
from ..tokens.span cimport Span

@@ -184,12 +184,20 @@ cdef class PhraseMatcher:
if len(doc) == 0:
continue
if isinstance(doc, Doc):
-if self.attr in (POS, TAG, LEMMA) and not doc.is_tagged:
-raise ValueError(Errors.E155.format())
-if self.attr == DEP and not doc.is_parsed:
+attrs = (TAG, POS, MORPH, LEMMA, DEP)
+has_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
+if self.attr == TAG and not has_annotation[TAG]:
+raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG"))
+if self.attr == POS and not has_annotation[POS]:
+raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS"))
+if self.attr == MORPH and not has_annotation[MORPH]:
+raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH"))
+if self.attr == LEMMA and not has_annotation[LEMMA]:
+raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA"))
+if self.attr == DEP and not has_annotation[DEP]:
raise ValueError(Errors.E156.format())
-if self._validate and (doc.is_tagged or doc.is_parsed) \
-and self.attr not in (DEP, POS, TAG, LEMMA):
+if self._validate and any(has_annotation.values()) \
+and self.attr not in attrs:
string_attr = self.vocab.strings[self.attr]
warnings.warn(Warnings.W012.format(key=key, attr=string_attr))
keyword = self._convert_to_array(doc)
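Similarly for the PhraseMatcher hunk above, pattern Docs must now carry the annotation that the chosen attr matches on. An illustrative sketch (not part of this diff):

import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc

vocab = spacy.blank("en").vocab
matcher = PhraseMatcher(vocab, attr="LEMMA")
pattern = Doc(vocab, words=["cats"])
pattern[0].lemma_ = "cat"  # without this, add() raises E155 for pipe="lemmatizer", attr="LEMMA"
matcher.add("CAT", [pattern])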
@@ -164,7 +164,7 @@ def MultiHashEmbed(


@registry.architectures.register("spacy.CharacterEmbed.v1")
-def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
+def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool):
"""Construct an embedded representation based on character embeddings, using
a feed-forward network. A fixed number of UTF-8 byte characters are used for
each word, taken from the beginning and end of the word equally. Padding is

@@ -188,18 +188,35 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
nC (int): The number of UTF-8 bytes to embed per word. Recommended values
are between 3 and 8, although it may depend on the length of words in the
language.
+also_use_static_vectors (bool): Whether to also use static word vectors.
+Requires a vectors table to be loaded in the Doc objects' vocab.
"""
-model = chain(
-concatenate(
-chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
-chain(
-FeatureExtractor([NORM]),
-list2ragged(),
-with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
+if also_use_static_vectors:
+model = chain(
+concatenate(
+chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
+chain(
+FeatureExtractor([NORM]),
+list2ragged(),
+with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
+),
+StaticVectors(width, dropout=0.0),
),
-),
-with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)),
-ragged2list(),
+with_array(Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0)),
+ragged2list(),
+)
+else:
+model = chain(
+concatenate(
+chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
+chain(
+FeatureExtractor([NORM]),
+list2ragged(),
+with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
+),
+),
+with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)),
+ragged2list(),
)
return model

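One note on the CharacterEmbed change above: when also_use_static_vectors is enabled, StaticVectors(width, dropout=0.0) is concatenated into the embedding, so the following Maxout layer sees nM * nC + 2 * width inputs instead of nM * nC + width. A hedged sketch of building the registered architecture directly in Python (illustrative only; the argument values are arbitrary, and the registry name is the one used in this diff):

from spacy import registry

make_embed = registry.architectures.get("spacy.CharacterEmbed.v1")
# Plain character + NORM hash embedding; set also_use_static_vectors=True only
# when the vocab has a vectors table loaded, per the docstring added above.
model = make_embed(width=128, rows=7000, nM=64, nC=8, also_use_static_vectors=False)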
@@ -679,8 +679,7 @@ cdef class ArcEager(TransitionSystem):
st._sent[i].dep = self.root_label

def finalize_doc(self, Doc doc):
-doc.is_parsed = True
-set_children_from_heads(doc.c, doc.length)
+set_children_from_heads(doc.c, 0, doc.length)

def has_gold(self, Example eg, start=0, end=None):
for word in eg.y[start:end]:

@@ -119,7 +119,7 @@ cpdef deprojectivize(Doc doc):
new_head = _find_new_head(doc[i], head_label)
doc.c[i].head = new_head.i - i
doc.c[i].dep = doc.vocab.strings.add(new_label)
-set_children_from_heads(doc.c, doc.length)
+set_children_from_heads(doc.c, 0, doc.length)
return doc

@@ -17,7 +17,7 @@ def merge_noun_chunks(doc: Doc) -> Doc:

DOCS: https://nightly.spacy.io/api/pipeline-functions#merge_noun_chunks
"""
-if not doc.is_parsed:
+if not doc.has_annotation("DEP"):
return doc
with doc.retokenize() as retokenizer:
for np in doc.noun_chunks:

@@ -32,6 +32,7 @@ width = 128
rows = 7000
nM = 64
nC = 8
+also_use_static_vectors = false

[model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
@@ -203,8 +204,6 @@ class Morphologizer(Tagger):
doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"][morph])
doc.c[j].pos = self.cfg["labels_pos"][morph]

-doc.is_morphed = True

def get_loss(self, examples, scores):
"""Find the loss and gradient of loss for the batch of documents and
their predicted scores.

@@ -259,79 +258,3 @@ class Morphologizer(Tagger):
results.update(Scorer.score_token_attr_per_feat(examples,
"morph", **kwargs))
return results

-def to_bytes(self, *, exclude=tuple()):
-"""Serialize the pipe to a bytestring.

-exclude (Iterable[str]): String names of serialization fields to exclude.
-RETURNS (bytes): The serialized object.

-DOCS: https://nightly.spacy.io/api/morphologizer#to_bytes
-"""
-serialize = {}
-serialize["model"] = self.model.to_bytes
-serialize["vocab"] = self.vocab.to_bytes
-serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
-return util.to_bytes(serialize, exclude)

-def from_bytes(self, bytes_data, *, exclude=tuple()):
-"""Load the pipe from a bytestring.

-bytes_data (bytes): The serialized pipe.
-exclude (Iterable[str]): String names of serialization fields to exclude.
-RETURNS (Morphologizer): The loaded Morphologizer.

-DOCS: https://nightly.spacy.io/api/morphologizer#from_bytes
-"""
-def load_model(b):
-try:
-self.model.from_bytes(b)
-except AttributeError:
-raise ValueError(Errors.E149) from None

-deserialize = {
-"vocab": lambda b: self.vocab.from_bytes(b),
-"cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
-"model": lambda b: load_model(b),
-}
-util.from_bytes(bytes_data, deserialize, exclude)
-return self

-def to_disk(self, path, *, exclude=tuple()):
-"""Serialize the pipe to disk.

-path (str / Path): Path to a directory.
-exclude (Iterable[str]): String names of serialization fields to exclude.

-DOCS: https://nightly.spacy.io/api/morphologizer#to_disk
-"""
-serialize = {
-"vocab": lambda p: self.vocab.to_disk(p),
-"model": lambda p: p.open("wb").write(self.model.to_bytes()),
-"cfg": lambda p: srsly.write_json(p, self.cfg),
-}
-util.to_disk(path, serialize, exclude)

-def from_disk(self, path, *, exclude=tuple()):
-"""Load the pipe from disk. Modifies the object in place and returns it.

-path (str / Path): Path to a directory.
-exclude (Iterable[str]): String names of serialization fields to exclude.
-RETURNS (Morphologizer): The modified Morphologizer object.

-DOCS: https://nightly.spacy.io/api/morphologizer#from_disk
-"""
-def load_model(p):
-with p.open("rb") as file_:
-try:
-self.model.from_bytes(file_.read())
-except AttributeError:
-raise ValueError(Errors.E149) from None

-deserialize = {
-"vocab": lambda p: self.vocab.from_disk(p),
-"cfg": lambda p: self.cfg.update(deserialize_config(p)),
-"model": load_model,
-}
-util.from_disk(path, deserialize, exclude)
-return self

@@ -170,79 +170,3 @@ class SentenceRecognizer(Tagger):
results = Scorer.score_spans(examples, "sents", **kwargs)
del results["sents_per_type"]
return results

-def to_bytes(self, *, exclude=tuple()):
-"""Serialize the pipe to a bytestring.

-exclude (Iterable[str]): String names of serialization fields to exclude.
-RETURNS (bytes): The serialized object.

-DOCS: https://nightly.spacy.io/api/sentencerecognizer#to_bytes
-"""
-serialize = {}
-serialize["model"] = self.model.to_bytes
-serialize["vocab"] = self.vocab.to_bytes
-serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
-return util.to_bytes(serialize, exclude)

-def from_bytes(self, bytes_data, *, exclude=tuple()):
-"""Load the pipe from a bytestring.

-bytes_data (bytes): The serialized pipe.
-exclude (Iterable[str]): String names of serialization fields to exclude.
-RETURNS (Tagger): The loaded SentenceRecognizer.

-DOCS: https://nightly.spacy.io/api/sentencerecognizer#from_bytes
-"""
-def load_model(b):
-try:
-self.model.from_bytes(b)
-except AttributeError:
-raise ValueError(Errors.E149) from None

-deserialize = {
-"vocab": lambda b: self.vocab.from_bytes(b),
-"cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
-"model": lambda b: load_model(b),
-}
-util.from_bytes(bytes_data, deserialize, exclude)
-return self

-def to_disk(self, path, *, exclude=tuple()):
-"""Serialize the pipe to disk.

-path (str / Path): Path to a directory.
-exclude (Iterable[str]): String names of serialization fields to exclude.

-DOCS: https://nightly.spacy.io/api/sentencerecognizer#to_disk
-"""
-serialize = {
-"vocab": lambda p: self.vocab.to_disk(p),
-"model": lambda p: p.open("wb").write(self.model.to_bytes()),
-"cfg": lambda p: srsly.write_json(p, self.cfg),
-}
-util.to_disk(path, serialize, exclude)

-def from_disk(self, path, *, exclude=tuple()):
-"""Load the pipe from disk. Modifies the object in place and returns it.

-path (str / Path): Path to a directory.
-exclude (Iterable[str]): String names of serialization fields to exclude.
-RETURNS (Tagger): The modified SentenceRecognizer object.

-DOCS: https://nightly.spacy.io/api/sentencerecognizer#from_disk
-"""
-def load_model(p):
-with p.open("rb") as file_:
-try:
-self.model.from_bytes(file_.read())
-except AttributeError:
-raise ValueError(Errors.E149) from None

-deserialize = {
-"vocab": lambda p: self.vocab.from_disk(p),
-"cfg": lambda p: self.cfg.update(deserialize_config(p)),
-"model": load_model,
-}
-util.from_disk(path, deserialize, exclude)
-return self

@@ -168,7 +168,6 @@ class Tagger(Pipe):
# Don't clobber preset POS tags
if doc.c[j].tag == 0:
doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
-doc.is_tagged = True

def update(self, examples, *, drop=0., sgd=None, losses=None, set_annotations=False):
"""Learn from a batch of documents and gold-standard information,

@@ -106,6 +106,7 @@ def test_doc_api_serialize(en_tokenizer, text):
tokens = en_tokenizer(text)
tokens[0].lemma_ = "lemma"
tokens[0].norm_ = "norm"
+tokens.ents = [(tokens.vocab.strings["PRODUCT"], 0, 1)]
tokens[0].ent_kb_id_ = "ent_kb_id"
new_tokens = Doc(tokens.vocab).from_bytes(tokens.to_bytes())
assert tokens.text == new_tokens.text

@@ -144,7 +145,6 @@ def test_doc_api_set_ents(en_tokenizer):

def test_doc_api_sents_empty_string(en_tokenizer):
doc = en_tokenizer("")
-doc.is_parsed = True
sents = list(doc.sents)
assert len(sents) == 0

@@ -181,10 +181,11 @@ def test_doc_api_right_edge(en_tokenizer):
text = "I have proposed to myself, for the sake of such as live under the government of the Romans, to translate those books into the Greek tongue."
heads = [2, 1, 0, -1, -1, -3, 15, 1, -2, -1, 1, -3, -1, -1, 1, -2, -1, 1,
-2, -7, 1, -19, 1, -2, -3, 2, 1, -3, -26]
+deps = ["dep"] * len(heads)
# fmt: on

tokens = en_tokenizer(text)
-doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
assert doc[6].text == "for"
subtree = [w.text for w in doc[6].subtree]
# fmt: off

@@ -240,7 +241,9 @@ def test_doc_api_similarity_match():
)
def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix):
tokens = en_tokenizer(sentence)
-doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
+doc = get_doc(
+tokens.vocab, [t.text for t in tokens], heads=heads, deps=["dep"] * len(heads)
+)
lca = doc.get_lca_matrix()
assert (lca == lca_matrix).all()
assert lca[1, 1] == 1

@@ -251,51 +254,55 @@ def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix):
def test_doc_is_nered(en_vocab):
words = ["I", "live", "in", "New", "York"]
doc = Doc(en_vocab, words=words)
-assert not doc.is_nered
+assert not doc.has_annotation("ENT_IOB")
doc.ents = [Span(doc, 3, 5, label="GPE")]
-assert doc.is_nered
+assert doc.has_annotation("ENT_IOB")
# Test creating doc from array with unknown values
arr = numpy.array([[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]], dtype="uint64")
doc = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], arr)
-assert doc.is_nered
+assert doc.has_annotation("ENT_IOB")
# Test serialization
new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
-assert new_doc.is_nered
+assert new_doc.has_annotation("ENT_IOB")


def test_doc_from_array_sent_starts(en_vocab):
words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."]
-heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6]
+heads = [0, -1, -2, -3, -4, -5, 0, -1, -2, -3]
# fmt: off
-deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep", "dep"]
+deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep"]
# fmt: on
-doc = Doc(en_vocab, words=words)
-for i, (dep, head) in enumerate(zip(deps, heads)):
-doc[i].dep_ = dep
-doc[i].head = doc[head]
-if head == i:
-doc[i].is_sent_start = True
-doc.is_parsed
+doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)

+# HEAD overrides SENT_START with warning
attrs = [SENT_START, HEAD]
arr = doc.to_array(attrs)
new_doc = Doc(en_vocab, words=words)
-with pytest.raises(ValueError):
+with pytest.warns(UserWarning):
new_doc.from_array(attrs, arr)

-attrs = [SENT_START, DEP]
+# no warning using default attrs
+attrs = doc._get_array_attrs()
+arr = doc.to_array(attrs)
+with pytest.warns(None) as record:
+new_doc.from_array(attrs, arr)
+assert len(record) == 0

+# only SENT_START uses SENT_START
+attrs = [SENT_START]
arr = doc.to_array(attrs)
new_doc = Doc(en_vocab, words=words)
new_doc.from_array(attrs, arr)
assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc]
-assert not new_doc.is_parsed
+assert not new_doc.has_annotation("DEP")

+# only HEAD uses HEAD
attrs = [HEAD, DEP]
arr = doc.to_array(attrs)
new_doc = Doc(en_vocab, words=words)
new_doc.from_array(attrs, arr)
assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc]
-assert new_doc.is_parsed
+assert new_doc.has_annotation("DEP")


def test_doc_from_array_morph(en_vocab):

@@ -365,9 +372,6 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
assert m_doc[9].idx == think_idx

m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
-with pytest.raises(ValueError):
-# important attributes from sentenziser or parser are missing
-assert list(m_doc.sents)
assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
# space delimiter considered, although spacy attribute was missing
assert str(m_doc) == " ".join(en_texts_without_empty)

@@ -379,6 +383,15 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
assert m_doc[9].idx == think_idx


+def test_doc_api_from_docs_ents(en_tokenizer):
+texts = ["Merging the docs is fun.", "They don't think alike."]
+docs = [en_tokenizer(t) for t in texts]
+docs[0].ents = ()
+docs[1].ents = (Span(docs[1], 0, 1, label="foo"),)
+doc = Doc.from_docs(docs)
+assert len(doc.ents) == 1


def test_doc_lang(en_vocab):
doc = Doc(en_vocab, words=["Hello", "world"])
assert doc.lang_ == "en"
@@ -399,3 +412,45 @@ def test_token_lexeme(en_vocab):
assert isinstance(token.lex, Lexeme)
assert token.lex.text == token.text
assert en_vocab[token.orth] == token.lex


+def test_has_annotation(en_vocab):
+doc = Doc(en_vocab, words=["Hello", "world"])
+attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "HEAD", "ENT_IOB", "ENT_TYPE")
+for attr in attrs:
+assert not doc.has_annotation(attr)

+doc[0].tag_ = "A"
+doc[0].pos_ = "X"
+doc[0].morph_ = "Feat=Val"
+doc[0].lemma_ = "a"
+doc[0].dep_ = "dep"
+doc[0].head = doc[1]
+doc.ents = [Span(doc, 0, 1, label="HELLO")]

+for attr in attrs:
+assert doc.has_annotation(attr)
+assert not doc.has_annotation(attr, require_complete=True)

+doc[1].tag_ = "A"
+doc[1].pos_ = "X"
+doc[1].morph_ = ""
+doc[1].lemma_ = "a"
+doc[1].dep_ = "dep"
+doc.ents = [Span(doc, 0, 2, label="HELLO")]

+for attr in attrs:
+assert doc.has_annotation(attr)
+assert doc.has_annotation(attr, require_complete=True)


+def test_is_flags_deprecated(en_tokenizer):
+doc = en_tokenizer("test")
+with pytest.deprecated_call():
+doc.is_tagged
+with pytest.deprecated_call():
+doc.is_parsed
+with pytest.deprecated_call():
+doc.is_nered
+with pytest.deprecated_call():
+doc.is_sentenced
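The new test_has_annotation above is the core of this changeset: the is_tagged/is_parsed/is_nered/is_sentenced flags become deprecated in favour of Doc.has_annotation. A condensed illustrative sketch of the same behaviour (not part of this diff):

import spacy
from spacy.tokens import Doc

vocab = spacy.blank("en").vocab
doc = Doc(vocab, words=["Hello", "world"])
assert not doc.has_annotation("TAG")
doc[0].tag_ = "NN"
assert doc.has_annotation("TAG")                              # any token annotated
assert not doc.has_annotation("TAG", require_complete=True)   # not every token yet
doc[1].tag_ = "NN"
assert doc.has_annotation("TAG", require_complete=True)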
@@ -24,7 +24,6 @@ def doc_not_parsed(en_tokenizer):
text = "This is a sentence. This is another sentence. And a third."
tokens = en_tokenizer(text)
doc = Doc(tokens.vocab, words=[t.text for t in tokens])
-doc.is_parsed = False
return doc


@@ -71,8 +70,9 @@ def test_spans_string_fn(doc):
def test_spans_root2(en_tokenizer):
text = "through North and South Carolina"
heads = [0, 3, -1, -2, -4]
+deps = ["dep"] * len(heads)
tokens = en_tokenizer(text)
-doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
assert doc[-2:].root.text == "Carolina"


@@ -92,7 +92,7 @@ def test_spans_span_sent(doc, doc_not_parsed):
def test_spans_lca_matrix(en_tokenizer):
"""Test span's lca matrix generation"""
tokens = en_tokenizer("the lazy dog slept")
-doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0])
+doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0], deps=["dep"] * 4)
lca = doc[:2].get_lca_matrix()
assert lca.shape == (2, 2)
assert lca[0, 0] == 0 # the & the -> the

@@ -112,11 +112,11 @@ def test_doc_token_api_ancestors(en_tokenizer):


def test_doc_token_api_head_setter(en_tokenizer):
-# the structure of this sentence depends on the English annotation scheme
text = "Yesterday I saw a dog that barked loudly."
heads = [2, 1, 0, 1, -2, 1, -2, -1, -6]
+deps = ["dep"] * len(heads)
tokens = en_tokenizer(text)
-doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)

assert doc[6].n_lefts == 1
assert doc[6].n_rights == 1

@@ -169,13 +169,46 @@ def test_doc_token_api_head_setter(en_tokenizer):
with pytest.raises(ValueError):
doc[0].head = doc2[0]

+# test sentence starts when two sentences are joined
+text = "This is one sentence. This is another sentence."
+heads = [0, -1, -2, -3, -4, 0, -1, -2, -3, -4]
+tokens = en_tokenizer(text)
+doc = get_doc(
+tokens.vocab,
+words=[t.text for t in tokens],
+heads=heads,
+deps=["dep"] * len(heads),
+)
+# initially two sentences
+assert doc[0].is_sent_start
+assert doc[5].is_sent_start
+assert doc[0].left_edge == doc[0]
+assert doc[0].right_edge == doc[4]
+assert doc[5].left_edge == doc[5]
+assert doc[5].right_edge == doc[9]

+# modifying with a sentence doesn't change sent starts
+doc[2].head = doc[3]
+assert doc[0].is_sent_start
+assert doc[5].is_sent_start
+assert doc[0].left_edge == doc[0]
+assert doc[0].right_edge == doc[4]
+assert doc[5].left_edge == doc[5]
+assert doc[5].right_edge == doc[9]

+# attach the second sentence to the first, resulting in one sentence
+doc[5].head = doc[0]
+assert doc[0].is_sent_start
+assert not doc[5].is_sent_start
+assert doc[0].left_edge == doc[0]
+assert doc[0].right_edge == doc[9]


def test_is_sent_start(en_tokenizer):
doc = en_tokenizer("This is a sentence. This is another.")
assert doc[5].is_sent_start is None
doc[5].is_sent_start = True
assert doc[5].is_sent_start is True
-doc.is_parsed = True
assert len(list(doc.sents)) == 2

@@ -184,7 +217,6 @@ def test_is_sent_end(en_tokenizer):
assert doc[4].is_sent_end is None
doc[5].is_sent_start = True
assert doc[4].is_sent_end is True
-doc.is_parsed = True
assert len(list(doc.sents)) == 2


@@ -209,14 +241,14 @@ def test_token0_has_sent_start_true():
doc = Doc(Vocab(), words=["hello", "world"])
assert doc[0].is_sent_start is True
assert doc[1].is_sent_start is None
-assert not doc.is_sentenced
+assert not doc.has_annotation("SENT_START")


def test_tokenlast_has_sent_end_true():
doc = Doc(Vocab(), words=["hello", "world"])
assert doc[0].is_sent_end is None
assert doc[1].is_sent_end is True
-assert not doc.is_sentenced
+assert not doc.has_annotation("SENT_START")


def test_token_api_conjuncts_chain(en_vocab):

@@ -3,11 +3,7 @@ import pytest

def test_noun_chunks_is_parsed_de(de_tokenizer):
"""Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed.
-To check this test, we're constructing a Doc
-with a new Vocab here and forcing is_parsed to 'False'
-to make sure the noun chunks don't run.
"""
doc = de_tokenizer("Er lag auf seinem")
-doc.is_parsed = False
with pytest.raises(ValueError):
list(doc.noun_chunks)

@@ -3,11 +3,7 @@ import pytest

def test_noun_chunks_is_parsed_el(el_tokenizer):
"""Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed.
-To check this test, we're constructing a Doc
-with a new Vocab here and forcing is_parsed to 'False'
-to make sure the noun chunks don't run.
"""
doc = el_tokenizer("είναι χώρα της νοτιοανατολικής")
-doc.is_parsed = False
with pytest.raises(ValueError):
list(doc.noun_chunks)

@@ -11,12 +11,8 @@ from ...util import get_doc

def test_noun_chunks_is_parsed(en_tokenizer):
"""Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed.
-To check this test, we're constructing a Doc
-with a new Vocab here and forcing is_parsed to 'False'
-to make sure the noun chunks don't run.
"""
doc = en_tokenizer("This is a sentence")
-doc.is_parsed = False
with pytest.raises(ValueError):
list(doc.noun_chunks)


@@ -7,8 +7,9 @@ from ...util import get_doc, apply_transition_sequence
@pytest.mark.parametrize("punct", [".", "!", "?", ""])
def test_en_sbd_single_punct(en_tokenizer, text, punct):
heads = [2, 1, 0, -1] if punct else [2, 1, 0]
+deps = ["dep"] * len(heads)
tokens = en_tokenizer(text + punct)
-doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
assert len(doc) == 4 if punct else 3
assert len(list(doc.sents)) == 1
assert sum(len(sent) for sent in doc.sents) == len(doc)

@@ -3,11 +3,7 @@ import pytest

def test_noun_chunks_is_parsed_es(es_tokenizer):
"""Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed.
-To check this test, we're constructing a Doc
-with a new Vocab here and forcing is_parsed to 'False'
-to make sure the noun chunks don't run.
"""
doc = es_tokenizer("en Oxford este verano")
-doc.is_parsed = False
with pytest.raises(ValueError):
list(doc.noun_chunks)

@@ -3,12 +3,8 @@ import pytest

def test_noun_chunks_is_parsed_fa(fa_tokenizer):
"""Test that noun_chunks raises Value Error for 'fa' language if Doc is not parsed.
-To check this test, we're constructing a Doc
-with a new Vocab here and forcing is_parsed to 'False'
-to make sure the noun chunks don't run.
"""

doc = fa_tokenizer("این یک جمله نمونه می باشد.")
-doc.is_parsed = False
with pytest.raises(ValueError):
list(doc.noun_chunks)

@@ -3,11 +3,7 @@ import pytest

def test_noun_chunks_is_parsed_fr(fr_tokenizer):
"""Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed.
-To check this test, we're constructing a Doc
-with a new Vocab here and forcing is_parsed to 'False'
-to make sure the noun chunks don't run.
"""
doc = fr_tokenizer("trouver des travaux antérieurs")
-doc.is_parsed = False
with pytest.raises(ValueError):
list(doc.noun_chunks)

@@ -3,11 +3,7 @@ import pytest

def test_noun_chunks_is_parsed_id(id_tokenizer):
"""Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed.
-To check this test, we're constructing a Doc
-with a new Vocab here and forcing is_parsed to 'False'
-to make sure the noun chunks don't run.
"""
doc = id_tokenizer("sebelas")
-doc.is_parsed = False
with pytest.raises(ValueError):
list(doc.noun_chunks)

@@ -3,11 +3,7 @@ import pytest

def test_noun_chunks_is_parsed_nb(nb_tokenizer):
"""Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed.
-To check this test, we're constructing a Doc
-with a new Vocab here and forcing is_parsed to 'False'
-to make sure the noun chunks don't run.
"""
doc = nb_tokenizer("Smørsausen brukes bl.a. til")
-doc.is_parsed = False
with pytest.raises(ValueError):
list(doc.noun_chunks)

@@ -5,12 +5,8 @@ from ...util import get_doc

def test_noun_chunks_is_parsed_sv(sv_tokenizer):
"""Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed.
-To check this test, we're constructing a Doc
-with a new Vocab here and forcing is_parsed to 'False'
-to make sure the noun chunks don't run.
"""
doc = sv_tokenizer("Studenten läste den bästa boken")
-doc.is_parsed = False
with pytest.raises(ValueError):
list(doc.noun_chunks)


@@ -8,7 +8,7 @@ from spacy.util import get_lang_class
# Only include languages with no external dependencies
# excluded: ru, uk
# excluded for custom tables: pl
-LANGUAGES = ["el", "en", "fr", "nl"]
+LANGUAGES = ["bn", "el", "en", "fa", "fr", "nb", "nl", "sv"]
# fmt: on

@@ -301,11 +301,14 @@ def test_matcher_basic_check(en_vocab):

def test_attr_pipeline_checks(en_vocab):
doc1 = Doc(en_vocab, words=["Test"])
-doc1.is_parsed = True
+doc1[0].dep_ = "ROOT"
doc2 = Doc(en_vocab, words=["Test"])
-doc2.is_tagged = True
+doc2[0].tag_ = "TAG"
+doc2[0].pos_ = "X"
+doc2[0].morph_ = "Feat=Val"
+doc2[0].lemma_ = "LEMMA"
doc3 = Doc(en_vocab, words=["Test"])
-# DEP requires is_parsed
+# DEP requires DEP
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{"DEP": "a"}]])
matcher(doc1)

@@ -313,7 +316,7 @@ def test_attr_pipeline_checks(en_vocab):
matcher(doc2)
with pytest.raises(ValueError):
matcher(doc3)
-# TAG, POS, LEMMA require is_tagged
+# TAG, POS, LEMMA require those values
for attr in ("TAG", "POS", "LEMMA"):
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{attr: "a"}]])

@@ -187,9 +187,11 @@ def test_phrase_matcher_bool_attrs(en_vocab):

def test_phrase_matcher_validation(en_vocab):
doc1 = Doc(en_vocab, words=["Test"])
-doc1.is_parsed = True
+doc1[0].dep_ = "ROOT"
doc2 = Doc(en_vocab, words=["Test"])
-doc2.is_tagged = True
+doc2[0].tag_ = "TAG"
+doc2[0].pos_ = "X"
+doc2[0].morph_ = "Feat=Val"
doc3 = Doc(en_vocab, words=["Test"])
matcher = PhraseMatcher(en_vocab, validate=True)
with pytest.warns(UserWarning):

@@ -212,18 +214,21 @@ def test_attr_validation(en_vocab):

def test_attr_pipeline_checks(en_vocab):
doc1 = Doc(en_vocab, words=["Test"])
-doc1.is_parsed = True
+doc1[0].dep_ = "ROOT"
doc2 = Doc(en_vocab, words=["Test"])
-doc2.is_tagged = True
+doc2[0].tag_ = "TAG"
+doc2[0].pos_ = "X"
+doc2[0].morph_ = "Feat=Val"
+doc2[0].lemma_ = "LEMMA"
doc3 = Doc(en_vocab, words=["Test"])
-# DEP requires is_parsed
+# DEP requires DEP
matcher = PhraseMatcher(en_vocab, attr="DEP")
matcher.add("TEST1", [doc1])
with pytest.raises(ValueError):
matcher.add("TEST2", [doc2])
with pytest.raises(ValueError):
matcher.add("TEST3", [doc3])
-# TAG, POS, LEMMA require is_tagged
+# TAG, POS, LEMMA require those values
for attr in ("TAG", "POS", "LEMMA"):
matcher = PhraseMatcher(en_vocab, attr=attr)
matcher.add("TEST2", [doc2])

@@ -67,8 +67,9 @@ def test_parser_initial(en_tokenizer, en_parser):
def test_parser_parse_subtrees(en_tokenizer, en_parser):
text = "The four wheels on the bus turned quickly"
heads = [2, 1, 4, -1, 1, -2, 0, -1]
+deps = ["dep"] * len(heads)
tokens = en_tokenizer(text)
-doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
assert len(list(doc[2].lefts)) == 2
assert len(list(doc[2].rights)) == 1
assert len(list(doc[2].children)) == 3

@@ -184,7 +185,7 @@ def test_parser_set_sent_starts(en_vocab):
if i == 0 or i == 3:
assert doc[i].is_sent_start is True
else:
-assert doc[i].is_sent_start is None
+assert doc[i].is_sent_start is False
for sent in doc.sents:
for token in sent:
assert token.head in sent

@@ -63,7 +63,7 @@ def test_parser_parse_navigate_consistency(en_tokenizer, text, heads):

def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads):
tokens = en_tokenizer(text)
-doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=["dep"] * len(heads))

lefts = {}
rights = {}

@@ -8,8 +8,9 @@ from ..util import get_doc, apply_transition_sequence
def test_parser_space_attachment(en_tokenizer):
text = "This is a test.\nTo ensure spaces are attached well."
heads = [1, 0, 1, -2, -3, -1, 1, 4, -1, 2, 1, 0, -1, -2]
+deps = ["dep"] * len(heads)
tokens = en_tokenizer(text)
-doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
for sent in doc.sents:
if len(sent) == 1:
assert not sent[-1].is_space

@@ -72,6 +72,8 @@ def test_attributeruler_init(nlp, pattern_dicts):
assert doc[2].morph_ == "Case=Nom|Number=Plur"
assert doc[3].lemma_ == "cat"
assert doc[3].morph_ == "Case=Nom|Number=Sing"
+assert doc.has_annotation("LEMMA")
+assert doc.has_annotation("MORPH")


def test_attributeruler_init_patterns(nlp, pattern_dicts):

@@ -82,6 +84,8 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
assert doc[2].morph_ == "Case=Nom|Number=Plur"
assert doc[3].lemma_ == "cat"
assert doc[3].morph_ == "Case=Nom|Number=Sing"
+assert doc.has_annotation("LEMMA")
+assert doc.has_annotation("MORPH")
nlp.remove_pipe("attribute_ruler")
# initialize with patterns from asset
nlp.add_pipe(

@@ -93,6 +97,8 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
assert doc[2].morph_ == "Case=Nom|Number=Plur"
assert doc[3].lemma_ == "cat"
assert doc[3].morph_ == "Case=Nom|Number=Sing"
+assert doc.has_annotation("LEMMA")
+assert doc.has_annotation("MORPH")


def test_attributeruler_score(nlp, pattern_dicts):

@@ -35,8 +35,6 @@ def doc2(en_tokenizer):
deps=deps,
)
doc.ents = [Span(doc, 2, 4, doc.vocab.strings["GPE"])]
-doc.is_parsed = True
-doc.is_tagged = True
return doc

@@ -345,7 +345,10 @@ def test_language_factories_invalid():
[{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}],
{"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25},
),
-([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75},),
+(
+[{"a": 0.5, "b": 0.5}, {"b": 1.0}],
+{"a": 0.25, "b": 0.75},
+),
],
)
def test_language_factories_combine_score_weights(weights, expected):

@@ -360,10 +363,16 @@ def test_language_factories_scores():
weights1 = {"a1": 0.5, "a2": 0.5}
weights2 = {"b1": 0.2, "b2": 0.7, "b3": 0.1}
Language.factory(
-f"{name}1", scores=list(weights1), default_score_weights=weights1, func=func,
+f"{name}1",
+scores=list(weights1),
+default_score_weights=weights1,
+func=func,
)
Language.factory(
-f"{name}2", scores=list(weights2), default_score_weights=weights2, func=func,
+f"{name}2",
+scores=list(weights2),
+default_score_weights=weights2,
+func=func,
)
meta1 = Language.get_factory_meta(f"{name}1")
assert meta1.default_score_weights == weights1

@@ -461,3 +470,21 @@ def test_pipe_factories_decorator_idempotent():
nlp = Language()
nlp.add_pipe(name)
Language.component(name2, func=func2)


+def test_pipe_factories_config_excludes_nlp():
+"""Test that the extra values we temporarily add to component config
+blocks/functions are removed and not copied around.
+"""
+name = "test_pipe_factories_config_excludes_nlp"
+func = lambda nlp, name: lambda doc: doc
+Language.factory(name, func=func)
+config = {
+"nlp": {"lang": "en", "pipeline": [name]},
+"components": {name: {"factory": name}},
+}
+nlp = English.from_config(config)
+assert nlp.pipe_names == [name]
+pipe_cfg = nlp.get_pipe_config(name)
+pipe_cfg == {"factory": name}
+assert nlp._pipe_configs[name] == {"factory": name}

@@ -9,7 +9,7 @@ def test_sentencizer(en_vocab):
doc = Doc(en_vocab, words=["Hello", "!", "This", "is", "a", "test", "."])
sentencizer = Sentencizer(punct_chars=None)
doc = sentencizer(doc)
-assert doc.is_sentenced
+assert doc.has_annotation("SENT_START")
sent_starts = [t.is_sent_start for t in doc]
sent_ends = [t.is_sent_end for t in doc]
assert sent_starts == [True, False, True, False, False, False, False]

@@ -22,13 +22,13 @@ def test_sentencizer_pipe():
nlp = English()
nlp.add_pipe("sentencizer")
for doc in nlp.pipe(texts):
-assert doc.is_sentenced
+assert doc.has_annotation("SENT_START")
sent_starts = [t.is_sent_start for t in doc]
assert sent_starts == [True, False, True, False, False, False, False]
assert len(list(doc.sents)) == 2
for ex in nlp.pipe(texts):
doc = ex.doc
-assert doc.is_sentenced
+assert doc.has_annotation("SENT_START")
sent_starts = [t.is_sent_start for t in doc]
assert sent_starts == [True, False, True, False, False, False, False]
assert len(list(doc.sents)) == 2

@@ -42,7 +42,7 @@ def test_sentencizer_empty_docs():
nlp.add_pipe("sentencizer")
for texts in [one_empty_text, many_empty_texts, some_empty_texts]:
for doc in nlp.pipe(texts):
-assert doc.is_sentenced
+assert doc.has_annotation("SENT_START")
sent_starts = [t.is_sent_start for t in doc]
if len(doc) == 0:
assert sent_starts == []

@@ -82,7 +82,7 @@ def test_sentencizer_complex(en_vocab, words, sent_starts, sent_ends, n_sents):
doc = Doc(en_vocab, words=words)
sentencizer = Sentencizer(punct_chars=None)
doc = sentencizer(doc)
-assert doc.is_sentenced
+assert doc.has_annotation("SENT_START")
assert [t.is_sent_start for t in doc] == sent_starts
assert [t.is_sent_end for t in doc] == sent_ends
assert len(list(doc.sents)) == n_sents

@@ -115,7 +115,7 @@ def test_sentencizer_custom_punct(
doc = Doc(en_vocab, words=words)
sentencizer = Sentencizer(punct_chars=punct_chars)
doc = sentencizer(doc)
-assert doc.is_sentenced
+assert doc.has_annotation("SENT_START")
assert [t.is_sent_start for t in doc] == sent_starts
assert [t.is_sent_end for t in doc] == sent_ends
assert len(list(doc.sents)) == n_sents
@@ -94,7 +94,6 @@ def test_issue309(en_tokenizer):
    doc = get_doc(
        tokens.vocab, words=[t.text for t in tokens], heads=[0], deps=["ROOT"]
    )
-    doc.is_parsed = True
    assert len(doc) == 1
    sents = list(doc.sents)
    assert len(sents) == 1

@@ -170,11 +169,9 @@ def test_issue595():

def test_issue599(en_vocab):
    doc = Doc(en_vocab)
-    doc.is_tagged = True
-    doc.is_parsed = True
    doc2 = Doc(doc.vocab)
    doc2.from_bytes(doc.to_bytes())
-    assert doc2.is_parsed
+    assert doc2.has_annotation("DEP")


def test_issue600():
@@ -14,7 +14,7 @@ from spacy.tokens import Doc, Span, Token
from spacy.attrs import HEAD, DEP
from spacy.matcher import Matcher

-from ..util import make_tempdir
+from ..util import make_tempdir, get_doc


def test_issue1506():

@@ -198,17 +198,26 @@ def test_issue1834():
    """Test that sentence boundaries & parse/tag flags are not lost
    during serialization."""
    string = "This is a first sentence . And another one"
-    doc = Doc(Vocab(), words=string.split())
-    doc[6].sent_start = True
+    words = string.split()
+    doc = get_doc(Vocab(), words=words)
+    doc[6].is_sent_start = True
    new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
    assert new_doc[6].sent_start
-    assert not new_doc.is_parsed
-    assert not new_doc.is_tagged
-    doc.is_parsed = True
-    doc.is_tagged = True
+    assert not new_doc.has_annotation("DEP")
+    assert not new_doc.has_annotation("TAG")
+    doc = get_doc(
+        Vocab(),
+        words=words,
+        tags=["TAG"] * len(words),
+        heads=[0, -1, -2, -3, -4, -5, 0, -1, -2],
+        deps=["dep"] * len(words),
+    )
+    print(doc.has_annotation("DEP"), [t.head.i for t in doc], [t.is_sent_start for t in doc])
    new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
-    assert new_doc.is_parsed
-    assert new_doc.is_tagged
+    print(new_doc.has_annotation("DEP"), [t.head.i for t in new_doc], [t.is_sent_start for t in new_doc])
+    assert new_doc[6].sent_start
+    assert new_doc.has_annotation("DEP")
+    assert new_doc.has_annotation("TAG")


def test_issue1868():
@@ -72,8 +72,6 @@ def test_issue2219(en_vocab):
def test_issue2361(de_tokenizer):
    chars = ("&lt;", "&gt;", "&amp;", "&quot;")
    doc = de_tokenizer('< > & " ')
-    doc.is_parsed = True
-    doc.is_tagged = True
    html = render(doc)
    for char in chars:
        assert char in html

@@ -108,6 +106,7 @@ def test_issue2385_biluo(tags):
def test_issue2396(en_vocab):
    words = ["She", "created", "a", "test", "for", "spacy"]
    heads = [1, 0, 1, -2, -1, -1]
+    deps = ["dep"] * len(heads)
    matrix = numpy.array(
        [
            [0, 1, 1, 1, 1, 1],

@@ -119,7 +118,7 @@ def test_issue2396(en_vocab):
        ],
        dtype=numpy.int32,
    )
-    doc = get_doc(en_vocab, words=words, heads=heads)
+    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
    span = doc[:]
    assert (doc.get_lca_matrix() == matrix).all()
    assert (span.get_lca_matrix() == matrix).all()
@@ -16,16 +16,16 @@ from ..util import get_doc


def test_issue2564():
-    """Test the tagger sets is_tagged correctly when used via Language.pipe."""
+    """Test the tagger sets has_annotation("TAG") correctly when used via Language.pipe."""
    nlp = Language()
    tagger = nlp.add_pipe("tagger")
    tagger.add_label("A")
    nlp.begin_training()
    doc = nlp("hello world")
-    assert doc.is_tagged
+    assert doc.has_annotation("TAG")
    docs = nlp.pipe(["hello", "world"])
    piped_doc = next(docs)
-    assert piped_doc.is_tagged
+    assert piped_doc.has_annotation("TAG")


def test_issue2569(en_tokenizer):

@@ -123,7 +123,7 @@ def test_issue2772(en_vocab):
    heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, 2, 1, -3, -4]
    deps = ["dep"] * len(heads)
    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
-    assert doc[1].is_sent_start is None
+    assert doc[1].is_sent_start is False


@pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"])
@@ -63,7 +63,7 @@ def test_issue3012(en_vocab):
    pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
    ents = [(2, 4, "PERCENT")]
    doc = get_doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
-    assert doc.is_tagged
+    assert doc.has_annotation("TAG")

    expected = ("10", "NUM", "CD", "PERCENT")
    assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected

@@ -83,10 +83,14 @@ def test_issue3012(en_vocab):
def test_issue3199():
    """Test that Span.noun_chunks works correctly if no noun chunks iterator
    is available. To make this test future-proof, we're constructing a Doc
-    with a new Vocab here and setting is_parsed to make sure the noun chunks run.
+    with a new Vocab here and a parse tree to make sure the noun chunks run.
    """
-    doc = Doc(Vocab(), words=["This", "is", "a", "sentence"])
-    doc.is_parsed = True
+    doc = get_doc(
+        Vocab(),
+        words=["This", "is", "a", "sentence"],
+        heads=[0, -1, -2, -3],
+        deps=["dep"] * 4,
+    )
    assert list(doc[0:3].noun_chunks) == []


@@ -250,16 +254,16 @@ def test_issue3456():


def test_issue3468():
-    """Test that sentence boundaries are set correctly so Doc.is_sentenced can
+    """Test that sentence boundaries are set correctly so Doc.has_annotation("SENT_START") can
    be restored after serialization."""
    nlp = English()
    nlp.add_pipe("sentencizer")
    doc = nlp("Hello world")
    assert doc[0].is_sent_start
-    assert doc.is_sentenced
+    assert doc.has_annotation("SENT_START")
    assert len(list(doc.sents)) == 1
    doc_bytes = doc.to_bytes()
    new_doc = Doc(nlp.vocab).from_bytes(doc_bytes)
    assert new_doc[0].is_sent_start
-    assert new_doc.is_sentenced
+    assert new_doc.has_annotation("SENT_START")
    assert len(list(new_doc.sents)) == 1
@@ -356,7 +356,6 @@ def test_issue3882(en_vocab):
    copy of the Doc.
    """
    doc = Doc(en_vocab, words=["Hello", "world"])
-    doc.is_parsed = True
    doc.user_data["test"] = set()
    parse_deps(doc)


@@ -386,7 +385,6 @@ def test_issue3959():
    doc[0].pos_ = "NOUN"
    assert doc[0].pos_ == "NOUN"
    # usually this is already True when starting from proper models instead of blank English
-    doc.is_tagged = True
    with make_tempdir() as tmp_dir:
        file_path = tmp_dir / "my_doc"
        doc.to_disk(file_path)
@@ -189,7 +189,6 @@ def test_issue4133(en_vocab):
    for i, token in enumerate(doc):
        token.pos_ = pos[i]
    # usually this is already True when starting from proper models instead of blank English
-    doc.is_tagged = True
    doc_bytes = doc.to_bytes()
    vocab = Vocab()
    vocab = vocab.from_bytes(vocab_bytes)

@@ -249,7 +248,7 @@ def test_issue4267():
    assert "ner" in nlp.pipe_names
    # assert that we have correct IOB annotations
    doc1 = nlp("hi")
-    assert doc1.is_nered
+    assert doc1.has_annotation("ENT_IOB")
    for token in doc1:
        assert token.ent_iob == 2
    # add entity ruler and run again

@@ -260,7 +259,7 @@ def test_issue4267():
    assert "ner" in nlp.pipe_names
    # assert that we still have correct IOB annotations
    doc2 = nlp("hi")
-    assert doc2.is_nered
+    assert doc2.has_annotation("ENT_IOB")
    for token in doc2:
        assert token.ent_iob == 2
@@ -5,7 +5,9 @@ from spacy.tokens import Doc, Span
from spacy.vocab import Vocab
from spacy.training import Example
from spacy.lang.en import English
+from spacy.lang.de import German
from spacy.util import registry
+import spacy

from .util import add_vecs_to_vocab, assert_docs_equal

@@ -266,3 +268,34 @@ def test_language_custom_tokenizer():
    assert [t.text for t in doc] == ["_hello", "_world"]
    doc = list(nlp.pipe(["hello world"]))[0]
    assert [t.text for t in doc] == ["_hello", "_world"]
+
+
+def test_language_from_config_invalid_lang():
+    """Test that calling Language.from_config raises an error and lang defined
+    in config needs to match language-specific subclasses."""
+    config = {"nlp": {"lang": "en"}}
+    with pytest.raises(ValueError):
+        Language.from_config(config)
+    with pytest.raises(ValueError):
+        German.from_config(config)
+
+
+def test_spacy_blank():
+    nlp = spacy.blank("en")
+    assert nlp.config["training"]["dropout"] == 0.1
+    config = {"training": {"dropout": 0.2}}
+    meta = {"name": "my_custom_model"}
+    nlp = spacy.blank("en", config=config, meta=meta)
+    assert nlp.config["training"]["dropout"] == 0.2
+    assert nlp.meta["name"] == "my_custom_model"
+
+
+@pytest.mark.parametrize(
+    "value",
+    [False, None, ["x", "y"], Language, Vocab],
+)
+def test_language_init_invalid_vocab(value):
+    err_fragment = "invalid value"
+    with pytest.raises(ValueError) as e:
+        Language(value)
+    assert err_fragment in str(e.value)
@@ -80,7 +80,6 @@ def tagged_doc():
        doc[i].morph_ = morphs[i]
        if i > 0:
            doc[i].is_sent_start = False
-    doc.is_tagged = True
    return doc

@@ -63,8 +63,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
    [
        (8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
        (8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}),
-        (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
+        (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
-        (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
+        (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
    ],
)
# fmt: on
@@ -12,7 +12,7 @@ from thinc.api import compounding
import pytest
import srsly

-from .util import make_tempdir
+from .util import make_tempdir, get_doc


@pytest.fixture

@@ -26,24 +26,16 @@ def doc():
        "NounType=prop|Number=sing", "PunctType=peri"]
    # head of '.' is intentionally nonprojective for testing
    heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5]
+    heads = [head - i for i, head in enumerate(heads)]
    deps = ["poss", "case", "nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"]
    lemmas = ["Sarah", "'s", "sister", "fly", "to", "Silicon", "Valley", "via", "London", "."]
-    biluo_tags = ["U-PERSON", "O", "O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
+    ents = ((0, 2, "PERSON"), (5, 7, "LOC"), (8, 9, "GPE"))
    cats = {"TRAVEL": 1.0, "BAKING": 0.0}
    # fmt: on
    nlp = English()
-    doc = nlp(text)
-    for i in range(len(tags)):
-        doc[i].tag_ = tags[i]
-        doc[i].pos_ = pos[i]
-        doc[i].morph_ = morphs[i]
-        doc[i].lemma_ = lemmas[i]
-        doc[i].dep_ = deps[i]
-        doc[i].head = doc[heads[i]]
-    doc.ents = spans_from_biluo_tags(doc, biluo_tags)
+    words = [t.text for t in nlp.make_doc(text)]
+    doc = get_doc(nlp.vocab, words=words, tags=tags, pos=pos, morphs=morphs, heads=heads, deps=deps, lemmas=lemmas, ents=ents)
    doc.cats = cats
-    doc.is_tagged = True
-    doc.is_parsed = True
    return doc

@@ -194,7 +186,7 @@ def test_json2docs_no_ner(en_vocab):
    docs = json2docs(data)
    assert len(docs) == 1
    for doc in docs:
-        assert not doc.is_nered
+        assert not doc.has_annotation("ENT_IOB")
        for token in doc:
            assert token.ent_iob == 0
    eg = Example(
@@ -274,7 +274,7 @@ def _merge(Doc doc, merges):
    for i in range(doc.length):
        doc.c[i].head -= i
    # Set the left/right children, left/right edges
-    set_children_from_heads(doc.c, doc.length)
+    set_children_from_heads(doc.c, 0, doc.length)
    # Make sure ent_iob remains consistent
    make_iob_consistent(doc.c, doc.length)
    # Return the merged Python object

@@ -381,7 +381,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
    for i in range(doc.length):
        doc.c[i].head -= i
    # set children from head
-    set_children_from_heads(doc.c, doc.length)
+    set_children_from_heads(doc.c, 0, doc.length)


def _validate_extensions(extensions):

@@ -408,7 +408,6 @@ cdef make_iob_consistent(TokenC* tokens, int length):
def normalize_token_attrs(Vocab vocab, attrs):
    if "_" in attrs:  # Extension attributes
        extensions = attrs["_"]
-        print("EXTENSIONS", extensions)
        _validate_extensions(extensions)
        attrs = {key: value for key, value in attrs.items() if key != "_"}
        attrs = intify_attrs(attrs, strings_map=vocab.strings)
@@ -13,7 +13,7 @@ from ..errors import Errors
from ..util import ensure_path, SimpleFrozenList

# fmt: off
-ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")
+ALL_ATTRS = ("ORTH", "NORM", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "ENT_ID", "LEMMA", "MORPH", "POS", "SENT_START")
# fmt: on

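# Illustrative sketch (not part of the diff): with the expanded ALL_ATTRS above, a
# DocBin round-trip should also carry NORM, ENT_ID and SENT_START by default. The
# blank "en" pipeline, the sample text and the round-trip assertions below are
# assumptions for the example, not code from this commit.
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
doc = nlp("Hello world. Another sentence.")
doc[3].is_sent_start = True  # manually mark a sentence start so SENT_START is set
doc_bin = DocBin(docs=[doc])
restored = list(DocBin().from_bytes(doc_bin.to_bytes()).get_docs(nlp.vocab))[0]
assert [t.norm_ for t in restored] == [t.norm_ for t in doc]
assert [t.is_sent_start for t in restored] == [t.is_sent_start for t in doc]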
@@ -19,10 +19,10 @@ ctypedef fused LexemeOrToken:
    const_TokenC_ptr


-cdef int set_children_from_heads(TokenC* tokens, int length) except -1
+cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1


-cdef int _set_lr_kids_and_edges(TokenC* tokens, int length, int loop_count) except -1
+cdef int _set_lr_kids_and_edges(TokenC* tokens, int start, int end, int loop_count) except -1


cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2

@@ -31,9 +31,6 @@ cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2
cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2


-cdef int set_children_from_heads(TokenC* tokens, int length) except -1


cdef int [:,:] _get_lca_matrix(Doc, int start, int end)

cdef class Doc:

@@ -49,10 +46,6 @@ cdef class Doc:

    cdef TokenC* c

-    cdef public bint is_tagged
-    cdef public bint is_parsed
-    cdef public bint is_morphed

    cdef public float sentiment

    cdef public dict user_hooks

@@ -74,5 +67,3 @@ cdef class Doc:
    cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1

    cpdef np.ndarray to_array(self, object features)

-    cdef void set_parse(self, const TokenC* parsed) nogil
@@ -1,37 +1,34 @@
# cython: infer_types=True, bounds_check=False, profile=True
cimport cython
cimport numpy as np
-from libc.string cimport memcpy, memset
+from libc.string cimport memcpy
from libc.math cimport sqrt
from libc.stdint cimport int32_t, uint64_t

import copy
from collections import Counter
import numpy
-import numpy.linalg
-import struct
import srsly
from thinc.api import get_array_module
from thinc.util import copy_array
import warnings
-import copy

from .span cimport Span
from .token cimport Token
from ..lexeme cimport Lexeme, EMPTY_LEXEME
from ..typedefs cimport attr_t, flags_t
-from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
+from ..attrs cimport attr_id_t
from ..attrs cimport LENGTH, POS, LEMMA, TAG, MORPH, DEP, HEAD, SPACY, ENT_IOB
-from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, attr_id_t
+from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, NORM
-from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t

-from ..attrs import intify_attr, intify_attrs, IDS
+from ..attrs import intify_attr, IDS
-from ..util import normalize_slice
from ..compat import copy_reg, pickle
from ..errors import Errors, Warnings
+from ..morphology import Morphology
from .. import util
from .underscore import Underscore, get_ext_args
from ._retokenize import Retokenizer
+from ._serialize import ALL_ATTRS as DOCBIN_ALL_ATTRS


DEF PADDING = 5
@@ -190,8 +187,6 @@ cdef class Doc:
        self.c = data_start + PADDING
        self.max_length = size
        self.length = 0
-        self.is_tagged = False
-        self.is_parsed = False
        self.sentiment = 0.0
        self.cats = {}
        self.user_hooks = {}

@@ -221,11 +216,6 @@ cdef class Doc:
            else:
                lexeme = self.vocab.get_by_orth(self.mem, word)
            self.push_back(lexeme, has_space)
-        # Tough to decide on policy for this. Is an empty doc tagged and parsed?
-        # There's no information we'd like to add to it, so I guess so?
-        if self.length == 0:
-            self.is_tagged = True
-            self.is_parsed = True

    @property
    def _(self):
@@ -233,37 +223,61 @@ cdef class Doc:
        return Underscore(Underscore.doc_extensions, self)

    @property
-    def is_sentenced(self):
-        """Check if the document has sentence boundaries assigned. This is
-        defined as having at least one of the following:
-
-        a) An entry "sents" in doc.user_hooks";
-        b) Doc.is_parsed is set to True;
-        c) At least one token other than the first where sent_start is not None.
-        """
-        if "sents" in self.user_hooks:
-            return True
-        if self.is_parsed:
-            return True
-        if len(self) < 2:
-            return True
-        for i in range(1, self.length):
-            if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
-                return True
-        return False
+    def is_tagged(self):
+        warnings.warn(Warnings.W107.format(prop="is_tagged", attr="TAG"), DeprecationWarning)
+        return self.has_annotation("TAG")
+
+    @property
+    def is_parsed(self):
+        warnings.warn(Warnings.W107.format(prop="is_parsed", attr="DEP"), DeprecationWarning)
+        return self.has_annotation("DEP")

    @property
    def is_nered(self):
-        """Check if the document has named entities set. Will return True if
-        *any* of the tokens has a named entity tag set (even if the others are
-        unknown values), or if the document is empty.
+        warnings.warn(Warnings.W107.format(prop="is_nered", attr="ENT_IOB"), DeprecationWarning)
+        return self.has_annotation("ENT_IOB")
+
+    @property
+    def is_sentenced(self):
+        warnings.warn(Warnings.W107.format(prop="is_sentenced", attr="SENT_START"), DeprecationWarning)
+        return self.has_annotation("SENT_START")
+
+    def has_annotation(self, attr, *, require_complete=False):
+        """Check whether the doc contains annotation on a token attribute.
+
+        attr (Union[int, str]): The attribute string name or int ID.
+        require_complete (bool): Whether to check that the attribute is set on
+            every token in the doc.
+        RETURNS (bool): Whether annotation is present.
+
+        DOCS: https://nightly.spacy.io/api/doc#has_annotation
        """
-        if len(self) == 0:
+        # empty docs are always annotated
+        if self.length == 0:
            return True
-        for i in range(self.length):
-            if self.c[i].ent_iob != 0:
+        cdef int i
+        cdef int range_start = 0
+        attr = intify_attr(attr)
+        # adjust attributes
+        if attr == HEAD:
+            # HEAD does not have an unset state, so rely on DEP
+            attr = DEP
+        elif attr == self.vocab.strings["IS_SENT_START"]:
+            # as in Matcher, allow IS_SENT_START as an alias of SENT_START
+            attr = SENT_START
+        # special cases for sentence boundaries
+        if attr == SENT_START:
+            if "sents" in self.user_hooks:
                return True
-        return False
+            # docs of length 1 always have sentence boundaries
+            if self.length == 1:
+                return True
+            range_start = 1
+        if require_complete:
+            return all(Token.get_struct_attr(&self.c[i], attr) for i in range(range_start, self.length))
+        else:
+            return any(Token.get_struct_attr(&self.c[i], attr) for i in range(range_start, self.length))

    def __getitem__(self, object i):
        """Get a `Token` or `Span` object.
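# Illustrative sketch (not part of the diff): the deprecated is_* flags now proxy to
# has_annotation(), which inspects the underlying token attribute directly. The blank
# "en" pipeline, the sentencizer component and the sample text are assumptions for
# the example.
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
doc = nlp("Hello world. Another sentence.")
assert doc.has_annotation("SENT_START")
assert not doc.has_annotation("DEP")                        # no parser in the pipeline
assert not doc.has_annotation("TAG", require_complete=True)  # no tagger either
doc.is_parsed  # still works, but now emits DeprecationWarning W107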
@@ -291,7 +305,7 @@ cdef class Doc:
        DOCS: https://nightly.spacy.io/api/doc#getitem
        """
        if isinstance(i, slice):
-            start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
+            start, stop = util.normalize_slice(len(self), i.start, i.stop, i.step)
            return Span(self, start, stop, label=0)
        if i < 0:
            i = self.length + i

@@ -627,16 +641,13 @@ cdef class Doc:
    @property
    def sents(self):
        """Iterate over the sentences in the document. Yields sentence `Span`
-        objects. Sentence spans have no label. To improve accuracy on informal
-        texts, spaCy calculates sentence boundaries from the syntactic
-        dependency parse. If the parser is disabled, the `sents` iterator will
-        be unavailable.
+        objects. Sentence spans have no label.

        YIELDS (Span): Sentences in the document.

        DOCS: https://nightly.spacy.io/api/doc#sents
        """
-        if not self.is_sentenced:
+        if not self.has_annotation("SENT_START"):
            raise ValueError(Errors.E030)
        if "sents" in self.user_hooks:
            yield from self.user_hooks["sents"](self)
@@ -660,10 +671,6 @@ cdef class Doc:
        return self.vocab.lang

    cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
-        if self.length == 0:
-            # Flip these to false when we see the first token.
-            self.is_tagged = False
-            self.is_parsed = False
        if self.length == self.max_length:
            self._realloc(self.length * 2)
        cdef TokenC* t = &self.c[self.length]

@@ -786,14 +793,6 @@ cdef class Doc:
        for i in range(self.length, self.max_length + PADDING):
            self.c[i].lex = &EMPTY_LEXEME

-    cdef void set_parse(self, const TokenC* parsed) nogil:
-        # TODO: This method is fairly misleading atm. It's used by Parser
-        # to actually apply the parse calculated. Need to rethink this.
-        # Probably we should use from_array?
-        self.is_parsed = True
-        for i in range(self.length):
-            self.c[i] = parsed[i]

    def from_array(self, attrs, array):
        """Load attributes from a numpy array. Write to a `Doc` object, from an
        `(M, N)` array of attributes.
@@ -818,8 +817,8 @@ cdef class Doc:
        if array.dtype != numpy.uint64:
            warnings.warn(Warnings.W028.format(type=array.dtype))

-        if SENT_START in attrs and HEAD in attrs:
-            raise ValueError(Errors.E032)
+        if set(attrs) != set(Doc._get_array_attrs()) and SENT_START in attrs and HEAD in attrs:
+            warnings.warn(Warnings.W106)
        cdef int i, col
        cdef int32_t abs_head_index
        cdef attr_id_t attr_id

@@ -879,18 +878,17 @@ cdef class Doc:
                    # add morph to morphology table
                    self.vocab.morphology.add(self.vocab.strings[value])
                Token.set_struct_attr(token, attr_ids[j], value)
-        # Set flags
-        self.is_parsed = bool(self.is_parsed or HEAD in attrs)
-        self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs)
-        # If document is parsed, set children
-        if self.is_parsed:
-            set_children_from_heads(self.c, length)
+        # If document is parsed, set children and sentence boundaries
+        if HEAD in attrs and DEP in attrs:
+            col = attrs.index(DEP)
+            if array[:, col].any():
+                set_children_from_heads(self.c, 0, length)
        return self

    @staticmethod
    def from_docs(docs, ensure_whitespace=True, attrs=None):
-        """Concatenate multiple Doc objects to form a new one. Raises an error if the `Doc` objects do not all share
-        the same `Vocab`.
+        """Concatenate multiple Doc objects to form a new one. Raises an error
+        if the `Doc` objects do not all share the same `Vocab`.

        docs (list): A list of Doc objects.
        ensure_whitespace (bool): Insert a space between two adjacent docs whenever the first doc does not end in whitespace.
@@ -908,16 +906,7 @@ cdef class Doc:
            (vocab,) = vocab

        if attrs is None:
-            attrs = [LEMMA, NORM]
-            if all(doc.is_nered for doc in docs):
-                attrs.extend([ENT_IOB, ENT_KB_ID, ENT_TYPE])
-            # TODO: separate for is_morphed?
-            if all(doc.is_tagged for doc in docs):
-                attrs.extend([TAG, POS, MORPH])
-            if all(doc.is_parsed for doc in docs):
-                attrs.extend([HEAD, DEP])
-            else:
-                attrs.append(SENT_START)
+            attrs = Doc._get_array_attrs()
        else:
            if any(isinstance(attr, str) for attr in attrs):  # resolve attribute names
                attrs = [intify_attr(attr) for attr in attrs]  # intify_attr returns None for invalid attrs
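# Illustrative sketch (not part of the diff): after this change, Doc.from_docs no
# longer inspects the removed is_* flags and concatenates the full default attribute
# set from Doc._get_array_attrs(). The blank "en" pipeline and texts are assumptions.
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
docs = [nlp("Hello world."), nlp("Another doc.")]
merged = Doc.from_docs(docs)  # default attrs now come from Doc._get_array_attrs()
assert merged.text == "Hello world. Another doc."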
@@ -989,9 +978,6 @@ cdef class Doc:
        other.tensor = copy.deepcopy(self.tensor)
        other.cats = copy.deepcopy(self.cats)
        other.user_data = copy.deepcopy(self.user_data)
-        other.is_tagged = self.is_tagged
-        other.is_parsed = self.is_parsed
-        other.is_morphed = self.is_morphed
        other.sentiment = self.sentiment
        other.has_unknown_spaces = self.has_unknown_spaces
        other.user_hooks = dict(self.user_hooks)
@@ -1065,22 +1051,16 @@ cdef class Doc:

        DOCS: https://nightly.spacy.io/api/doc#to_bytes
        """
-        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, NORM, ENT_KB_ID]
-        if self.is_tagged:
-            array_head.extend([TAG, POS])
-        # If doc parsed add head and dep attribute
-        if self.is_parsed:
-            array_head.extend([HEAD, DEP])
-        # Otherwise add sent_start
-        else:
-            array_head.append(SENT_START)
+        array_head = Doc._get_array_attrs()
        strings = set()
        for token in self:
            strings.add(token.tag_)
            strings.add(token.lemma_)
+            strings.add(token.morph_)
            strings.add(token.dep_)
            strings.add(token.ent_type_)
            strings.add(token.ent_kb_id_)
+            strings.add(token.ent_id_)
            strings.add(token.norm_)
        # Msgpack doesn't distinguish between lists and tuples, which is
        # vexing for user data. As a best guess, we *know* that within
|
||||||
DOCS: https://nightly.spacy.io/api/doc#to_json
|
DOCS: https://nightly.spacy.io/api/doc#to_json
|
||||||
"""
|
"""
|
||||||
data = {"text": self.text}
|
data = {"text": self.text}
|
||||||
if self.is_nered:
|
if self.has_annotation("ENT_IOB"):
|
||||||
data["ents"] = [{"start": ent.start_char, "end": ent.end_char,
|
data["ents"] = [{"start": ent.start_char, "end": ent.end_char,
|
||||||
"label": ent.label_} for ent in self.ents]
|
"label": ent.label_} for ent in self.ents]
|
||||||
if self.is_sentenced:
|
if self.has_annotation("SENT_START"):
|
||||||
sents = list(self.sents)
|
sents = list(self.sents)
|
||||||
data["sents"] = [{"start": sent.start_char, "end": sent.end_char}
|
data["sents"] = [{"start": sent.start_char, "end": sent.end_char}
|
||||||
for sent in sents]
|
for sent in sents]
|
||||||
if self.cats:
|
if self.cats:
|
||||||
data["cats"] = self.cats
|
data["cats"] = self.cats
|
||||||
data["tokens"] = []
|
data["tokens"] = []
|
||||||
|
attrs = ["TAG", "MORPH", "POS", "LEMMA", "DEP"]
|
||||||
|
include_annotation = {attr: self.has_annotation(attr) for attr in attrs}
|
||||||
for token in self:
|
for token in self:
|
||||||
token_data = {"id": token.i, "start": token.idx, "end": token.idx + len(token)}
|
token_data = {"id": token.i, "start": token.idx, "end": token.idx + len(token)}
|
||||||
if self.is_tagged:
|
if include_annotation["TAG"]:
|
||||||
token_data["pos"] = token.pos_
|
|
||||||
token_data["tag"] = token.tag_
|
token_data["tag"] = token.tag_
|
||||||
if self.is_parsed:
|
if include_annotation["POS"]:
|
||||||
|
token_data["pos"] = token.pos_
|
||||||
|
if include_annotation["MORPH"]:
|
||||||
|
token_data["morph"] = token.morph_
|
||||||
|
if include_annotation["LEMMA"]:
|
||||||
|
token_data["lemma"] = token.lemma_
|
||||||
|
if include_annotation["DEP"]:
|
||||||
token_data["dep"] = token.dep_
|
token_data["dep"] = token.dep_
|
||||||
token_data["head"] = token.head.i
|
token_data["head"] = token.head.i
|
||||||
data["tokens"].append(token_data)
|
data["tokens"].append(token_data)
|
||||||
|
@ -1291,6 +1278,12 @@ cdef class Doc:
|
||||||
j += 1
|
j += 1
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _get_array_attrs():
|
||||||
|
attrs = [LENGTH, SPACY]
|
||||||
|
attrs.extend(intify_attr(x) for x in DOCBIN_ALL_ATTRS)
|
||||||
|
return tuple(attrs)
|
||||||
|
|
||||||
|
|
||||||
cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2:
|
cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2:
|
||||||
cdef int i = token_by_char(tokens, length, start_char)
|
cdef int i = token_by_char(tokens, length, start_char)
|
||||||
|
@@ -1321,13 +1314,13 @@ cdef int token_by_char(const TokenC* tokens, int length, int char_idx) except -2:
            return mid
    return -1

-cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
+cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1:
+    # note: end is exclusive
    cdef TokenC* head
    cdef TokenC* child
    cdef int i
    # Set number of left/right children to 0. We'll increment it in the loops.
-    for i in range(length):
+    for i in range(start, end):
        tokens[i].l_kids = 0
        tokens[i].r_kids = 0
        tokens[i].l_edge = i

@@ -1341,38 +1334,40 @@ cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1:
    # without risking getting stuck in an infinite loop if something is
    # terribly malformed.
    while not heads_within_sents:
-        heads_within_sents = _set_lr_kids_and_edges(tokens, length, loop_count)
+        heads_within_sents = _set_lr_kids_and_edges(tokens, start, end, loop_count)
        if loop_count > 10:
            warnings.warn(Warnings.W026)
            break
        loop_count += 1
    # Set sentence starts
-    for i in range(length):
-        if tokens[i].head == 0 and tokens[i].dep != 0:
-            tokens[tokens[i].l_edge].sent_start = True
+    for i in range(start, end):
+        tokens[i].sent_start = -1
+    for i in range(start, end):
+        if tokens[i].head == 0:
+            tokens[tokens[i].l_edge].sent_start = 1


-cdef int _set_lr_kids_and_edges(TokenC* tokens, int length, int loop_count) except -1:
+cdef int _set_lr_kids_and_edges(TokenC* tokens, int start, int end, int loop_count) except -1:
    # May be called multiple times due to non-projectivity. See issues #3170
    # and #4688.
    # Set left edges
    cdef TokenC* head
    cdef TokenC* child
    cdef int i, j
-    for i in range(length):
+    for i in range(start, end):
        child = &tokens[i]
        head = &tokens[i + child.head]
-        if child < head and loop_count == 0:
+        if loop_count == 0 and child < head:
            head.l_kids += 1
        if child.l_edge < head.l_edge:
            head.l_edge = child.l_edge
        if child.r_edge > head.r_edge:
            head.r_edge = child.r_edge
    # Set right edges - same as above, but iterate in reverse
-    for i in range(length-1, -1, -1):
+    for i in range(end-1, start-1, -1):
        child = &tokens[i]
        head = &tokens[i + child.head]
-        if child > head and loop_count == 0:
+        if loop_count == 0 and child > head:
            head.r_kids += 1
        if child.r_edge > head.r_edge:
            head.r_edge = child.r_edge
|
||||||
head.l_edge = child.l_edge
|
head.l_edge = child.l_edge
|
||||||
# Get sentence start positions according to current state
|
# Get sentence start positions according to current state
|
||||||
sent_starts = set()
|
sent_starts = set()
|
||||||
for i in range(length):
|
for i in range(start, end):
|
||||||
if tokens[i].head == 0 and tokens[i].dep != 0:
|
if tokens[i].head == 0:
|
||||||
sent_starts.add(tokens[i].l_edge)
|
sent_starts.add(tokens[i].l_edge)
|
||||||
cdef int curr_sent_start = 0
|
cdef int curr_sent_start = 0
|
||||||
cdef int curr_sent_end = 0
|
cdef int curr_sent_end = 0
|
||||||
# Check whether any heads are not within the current sentence
|
# Check whether any heads are not within the current sentence
|
||||||
for i in range(length):
|
for i in range(start, end):
|
||||||
if (i > 0 and i in sent_starts) or i == length - 1:
|
if (i > 0 and i in sent_starts) or i == end - 1:
|
||||||
curr_sent_end = i
|
curr_sent_end = i
|
||||||
for j in range(curr_sent_start, curr_sent_end):
|
for j in range(curr_sent_start, curr_sent_end):
|
||||||
if tokens[j].head + j < curr_sent_start or tokens[j].head + j >= curr_sent_end + 1:
|
if tokens[j].head + j < curr_sent_start or tokens[j].head + j >= curr_sent_end + 1:
|
||||||
|
@@ -1436,6 +1431,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
    with shape (n, n), where n = len(doc).
    """
    cdef int [:,:] lca_matrix
+    cdef int j, k
    n_tokens= end - start
    lca_mat = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32)
    lca_mat.fill(-1)
@@ -4,13 +4,10 @@ cimport numpy as np
from libc.math cimport sqrt

import numpy
-import numpy.linalg
from thinc.api import get_array_module
-from collections import defaultdict
import warnings

from .doc cimport token_by_start, token_by_end, get_token_attr, _get_lca_matrix
-from .token cimport TokenC
from ..structs cimport TokenC, LexemeC
from ..typedefs cimport flags_t, attr_t, hash_t
from ..attrs cimport attr_id_t
@@ -204,7 +201,7 @@ cdef class Span:
        return Underscore(Underscore.span_extensions, self,
                          start=self.start_char, end=self.end_char)

-    def as_doc(self, bint copy_user_data=False):
+    def as_doc(self, *, bint copy_user_data=False):
        """Create a `Doc` object with a copy of the `Span`'s data.

        copy_user_data (bool): Whether or not to copy the original doc's user data.

@@ -212,19 +209,10 @@ cdef class Span:

        DOCS: https://nightly.spacy.io/api/span#as_doc
        """
-        # TODO: make copy_user_data a keyword-only argument (Python 3 only)
        words = [t.text for t in self]
        spaces = [bool(t.whitespace_) for t in self]
        cdef Doc doc = Doc(self.doc.vocab, words=words, spaces=spaces)
-        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, ENT_KB_ID]
-        if self.doc.is_tagged:
-            array_head.append(TAG)
-        # If doc parsed add head and dep attribute
-        if self.doc.is_parsed:
-            array_head.extend([HEAD, DEP])
-        # Otherwise add sent_start
-        else:
-            array_head.append(SENT_START)
+        array_head = self.doc._get_array_attrs()
        array = self.doc.to_array(array_head)
        array = array[self.start : self.end]
        self._fix_dep_copy(array_head, array)
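# Illustrative sketch (not part of the diff): copy_user_data is now keyword-only, so
# callers must pass it by name. The blank pipeline and sample text are assumptions.
import spacy

nlp = spacy.blank("en")
doc = nlp("New York is big")
span = doc[0:2]
span_doc = span.as_doc(copy_user_data=True)  # positional as_doc(True) would now fail
assert [t.text for t in span_doc] == ["New", "York"]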
@@ -378,7 +366,7 @@ cdef class Span:
            self.doc.sents
        # Use `sent_start` token attribute to find sentence boundaries
        cdef int n = 0
-        if self.doc.is_sentenced:
+        if self.doc.has_annotation("SENT_START"):
            # Find start of the sentence
            start = self.start
            while self.doc.c[start].sent_start != 1 and start > 0:

@@ -510,8 +498,6 @@ cdef class Span:

        DOCS: https://nightly.spacy.io/api/span#noun_chunks
        """
-        if not self.doc.is_parsed:
-            raise ValueError(Errors.E029)
        # Accumulate the result before beginning to iterate over it. This
        # prevents the tokenisation from being changed out from under us
        # during the iteration. The tricky thing here is that Span accepts
@@ -1,6 +1,4 @@
# cython: infer_types=True
-from libc.string cimport memcpy
-from cpython.mem cimport PyMem_Malloc, PyMem_Free
# Compiler crashes on memory view coercion without this. Should report bug.
from cython.view cimport array as cvarray
cimport numpy as np

@@ -14,14 +12,13 @@ from ..typedefs cimport hash_t
from ..lexeme cimport Lexeme
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
-from ..attrs cimport IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM, LIKE_EMAIL
-from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX
-from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP
+from ..attrs cimport IS_TITLE, IS_UPPER, IS_CURRENCY, IS_STOP
+from ..attrs cimport LIKE_URL, LIKE_NUM, LIKE_EMAIL
from ..symbols cimport conj
from .morphanalysis cimport MorphAnalysis
+from .doc cimport set_children_from_heads

from .. import parts_of_speech
-from .. import util
from ..errors import Errors, Warnings
from .underscore import Underscore, get_ext_args
@@ -489,7 +486,7 @@ cdef class Token:
            return True

        def __set__(self, value):
-            if self.doc.is_parsed:
+            if self.doc.has_annotation("DEP"):
                raise ValueError(Errors.E043)
            if value is None:
                self.c.sent_start = 0
|
||||||
# Do nothing if old head is new head
|
# Do nothing if old head is new head
|
||||||
if self.i + self.c.head == new_head.i:
|
if self.i + self.c.head == new_head.i:
|
||||||
return
|
return
|
||||||
cdef Token old_head = self.head
|
# Find the widest l/r_edges of the roots of the two tokens involved
|
||||||
cdef int rel_newhead_i = new_head.i - self.i
|
# to limit the number of tokens for set_children_from_heads
|
||||||
# Is the new head a descendant of the old head
|
cdef Token self_root, new_head_root
|
||||||
cdef bint is_desc = old_head.is_ancestor(new_head)
|
self_ancestors = list(self.ancestors)
|
||||||
cdef int new_edge
|
new_head_ancestors = list(new_head.ancestors)
|
||||||
cdef Token anc, child
|
self_root = self_ancestors[-1] if self_ancestors else self
|
||||||
# Update number of deps of old head
|
new_head_root = new_head_ancestors[-1] if new_head_ancestors else new_head
|
||||||
if self.c.head > 0: # left dependent
|
start = self_root.c.l_edge if self_root.c.l_edge < new_head_root.c.l_edge else new_head_root.c.l_edge
|
||||||
old_head.c.l_kids -= 1
|
end = self_root.c.r_edge if self_root.c.r_edge > new_head_root.c.r_edge else new_head_root.c.r_edge
|
||||||
if self.c.l_edge == old_head.c.l_edge:
|
|
||||||
# The token dominates the left edge so the left edge of
|
|
||||||
# the head may change when the token is reattached, it may
|
|
||||||
# not change if the new head is a descendant of the current
|
|
||||||
# head.
|
|
||||||
new_edge = self.c.l_edge
|
|
||||||
# The new l_edge is the left-most l_edge on any of the
|
|
||||||
# other dependents where the l_edge is left of the head,
|
|
||||||
# otherwise it is the head
|
|
||||||
if not is_desc:
|
|
||||||
new_edge = old_head.i
|
|
||||||
for child in old_head.children:
|
|
||||||
if child == self:
|
|
||||||
continue
|
|
||||||
if child.c.l_edge < new_edge:
|
|
||||||
new_edge = child.c.l_edge
|
|
||||||
old_head.c.l_edge = new_edge
|
|
||||||
# Walk up the tree from old_head and assign new l_edge to
|
|
||||||
# ancestors until an ancestor already has an l_edge that's
|
|
||||||
# further left
|
|
||||||
for anc in old_head.ancestors:
|
|
||||||
if anc.c.l_edge <= new_edge:
|
|
||||||
break
|
|
||||||
anc.c.l_edge = new_edge
|
|
||||||
elif self.c.head < 0: # right dependent
|
|
||||||
old_head.c.r_kids -= 1
|
|
||||||
# Do the same thing as for l_edge
|
|
||||||
if self.c.r_edge == old_head.c.r_edge:
|
|
||||||
new_edge = self.c.r_edge
|
|
||||||
if not is_desc:
|
|
||||||
new_edge = old_head.i
|
|
||||||
for child in old_head.children:
|
|
||||||
if child == self:
|
|
||||||
continue
|
|
||||||
if child.c.r_edge > new_edge:
|
|
||||||
new_edge = child.c.r_edge
|
|
||||||
old_head.c.r_edge = new_edge
|
|
||||||
for anc in old_head.ancestors:
|
|
||||||
if anc.c.r_edge >= new_edge:
|
|
||||||
break
|
|
||||||
anc.c.r_edge = new_edge
|
|
||||||
# Update number of deps of new head
|
|
||||||
if rel_newhead_i > 0: # left dependent
|
|
||||||
new_head.c.l_kids += 1
|
|
||||||
# Walk up the tree from new head and set l_edge to self.l_edge
|
|
||||||
# until you hit a token with an l_edge further to the left
|
|
||||||
if self.c.l_edge < new_head.c.l_edge:
|
|
||||||
new_head.c.l_edge = self.c.l_edge
|
|
||||||
for anc in new_head.ancestors:
|
|
||||||
if anc.c.l_edge <= self.c.l_edge:
|
|
||||||
break
|
|
||||||
anc.c.l_edge = self.c.l_edge
|
|
||||||
elif rel_newhead_i < 0: # right dependent
|
|
||||||
new_head.c.r_kids += 1
|
|
||||||
# Do the same as for l_edge
|
|
||||||
if self.c.r_edge > new_head.c.r_edge:
|
|
||||||
new_head.c.r_edge = self.c.r_edge
|
|
||||||
for anc in new_head.ancestors:
|
|
||||||
if anc.c.r_edge >= self.c.r_edge:
|
|
||||||
break
|
|
||||||
anc.c.r_edge = self.c.r_edge
|
|
||||||
# Set new head
|
# Set new head
|
||||||
self.c.head = rel_newhead_i
|
self.c.head = new_head.i - self.i
|
||||||
|
# Adjust parse properties and sentence starts
|
||||||
|
set_children_from_heads(self.doc.c, start, end + 1)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def conjuncts(self):
|
def conjuncts(self):
|
||||||
|
|
|
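The refactored `Token.head` setter above now delegates all bookkeeping (left/right edges, child counts, sentence starts) to `set_children_from_heads` over the span covered by the two affected subtrees. Below is a minimal sketch of how this surfaces through the public API; it assumes spaCy v3 and an installed pipeline with a parser (here `en_core_web_sm`), and the sentence and indices are illustrative only.

```python
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")

token = doc[3]                      # "insurance"
print(token.head.text, token.dep_)  # current head and relation of this token

# Reassigning the head goes through the __set__ shown in the hunk: it stores
# the relative head offset and then rebuilds children, edges and sentence
# starts for the covered span via set_children_from_heads.
token.head = doc[2]
print(token.head.text, [t.text for t in doc[2].subtree])
```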
@@ -212,8 +212,6 @@ def doc_from_conllu_sentence(
         doc[i]._.merged_spaceafter = spaces[i]
     ents = get_entities(lines, ner_tag_pattern, ner_map)
     doc.ents = spans_from_biluo_tags(doc, ents)
-    doc.is_parsed = True
-    doc.is_tagged = True
 
     if merge_subtokens:
         doc = merge_conllu_subtokens(lines, doc)
@@ -243,8 +241,6 @@ def doc_from_conllu_sentence(
         doc_x[i].dep_ = deps[i]
         doc_x[i].head = doc_x[heads[i]]
     doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents]
-    doc_x.is_parsed = True
-    doc_x.is_tagged = True
 
     return doc_x
@@ -33,19 +33,25 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
                 link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}}
                 json_para["links"].append(link_dict)
         biluo_tags = biluo_tags_from_offsets(doc, json_para["entities"], missing=ner_missing_tag)
+        attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "ENT_IOB")
+        include_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
         for j, sent in enumerate(doc.sents):
             json_sent = {"tokens": [], "brackets": []}
             for token in sent:
                 json_token = {"id": token.i, "orth": token.text, "space": token.whitespace_}
-                if doc.is_tagged:
+                if include_annotation["TAG"]:
                     json_token["tag"] = token.tag_
+                if include_annotation["POS"]:
                     json_token["pos"] = token.pos_
+                if include_annotation["MORPH"]:
                     json_token["morph"] = token.morph_
+                if include_annotation["LEMMA"]:
                     json_token["lemma"] = token.lemma_
-                if doc.is_parsed:
+                if include_annotation["DEP"]:
                     json_token["head"] = token.head.i-token.i
                     json_token["dep"] = token.dep_
-                json_token["ner"] = biluo_tags[token.i]
+                if include_annotation["ENT_IOB"]:
+                    json_token["ner"] = biluo_tags[token.i]
                 json_sent["tokens"].append(json_token)
             json_para["sentences"].append(json_sent)
         json_doc["paragraphs"].append(json_para)
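The `docs_to_json` change above replaces the global `is_tagged`/`is_parsed` flags with per-attribute checks. The sketch below shows the same gating pattern in isolation; it assumes spaCy v3 and uses a blank pipeline, so the attributes come out as unset.

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("This text has no tagger or parser annotation")

attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "ENT_IOB")
include_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
print(include_annotation)  # all False for a blank pipeline

tokens = []
for token in doc:
    json_token = {"id": token.i, "orth": token.text}
    if include_annotation["TAG"]:
        json_token["tag"] = token.tag_
    if include_annotation["DEP"]:
        json_token["head"] = token.head.i - token.i
        json_token["dep"] = token.dep_
    tokens.append(json_token)
print(tokens[0])
```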
@@ -72,7 +72,7 @@ def wandb_logger(project_name: str, remove_config_values: List[str] = []):
         for field in remove_config_values:
             del config_dot[field]
         config = util.dot_to_dict(config_dot)
-        wandb.init(project=project_name, config=config)
+        wandb.init(project=project_name, config=config, reinit=True)
         console_log_step, console_finalize = console(nlp)
 
         def log_step(info: Dict[str, Any]):
@@ -88,7 +88,7 @@ def wandb_logger(project_name: str, remove_config_values: List[str] = []):
 
         def finalize():
             console_finalize()
-            pass
+            wandb.join()
 
         return log_step, finalize
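The two `wandb_logger` hunks touch a training logger that follows spaCy v3's logger protocol: a registered factory returns a `setup(nlp)` function, which in turn returns a `(log_step, finalize)` pair, exactly the shape visible in the diff. Below is a hedged sketch of a custom logger built on that protocol; the registry name `my_json_logger.v1`, the file path and the exact keys carried by `info` are assumptions for illustration, not part of the commit.

```python
from typing import Any, Dict, Optional
import json

import spacy
from spacy.language import Language


@spacy.registry.loggers("my_json_logger.v1")  # hypothetical registry name
def json_logger(log_path: str = "training.jsonl"):
    def setup_logger(nlp: Language):
        # One JSON line per logged training step
        log_file = open(log_path, "w", encoding="utf8")

        def log_step(info: Optional[Dict[str, Any]]):
            if info is not None:
                # info typically carries keys such as "epoch", "step",
                # "score" and "losses" (assumed here)
                log_file.write(json.dumps(info, default=str) + "\n")

        def finalize():
            log_file.close()

        return log_step, finalize

    return setup_logger
```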
@@ -121,18 +121,19 @@ customize those settings in your config file later.
 > ```
 
 ```cli
-$ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [--cpu]
+$ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [--cpu] [--pretraining]
 ```
 
 | Name | Description |
 | ---------------------- | ----------- |
 | `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ |
 | `--lang`, `-l` | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. ~~str (option)~~ |
 | `--pipeline`, `-p` | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include. Defaults to `"tagger,parser,ner"`. ~~str (option)~~ |
 | `--optimize`, `-o` | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. ~~str (option)~~ |
 | `--cpu`, `-C` | Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters. ~~bool (flag)~~ |
+| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~ |
 | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
 | **CREATES** | The config file for training. |
 
 ### init fill-config {#init-fill-config new="3"}
 
@@ -160,13 +161,14 @@ validation error with more details.
 $ python -m spacy init fill-config [base_path] [output_file] [--diff]
 ```
 
 | Name | Description |
 | --------------------- | ----------- |
 | `base_path` | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~ |
 | `output_file` | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. ~~Path (positional)~~ |
+| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~ |
 | `--diff`, `-D` | Print a visual diff highlighting the changes. ~~bool (flag)~~ |
 | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
 | **CREATES** | Complete and auto-filled config file for training. |
 
 ### init vocab {#init-vocab new="3" tag="command"}
 
@@ -267,6 +267,17 @@ ancestor is found, e.g. if span excludes a necessary ancestor.
 | ----------- | ----------- |
 | **RETURNS** | The lowest common ancestor matrix of the `Doc`. ~~numpy.ndarray[ndim=2, dtype=int32]~~ |
 
+## Doc.has_annotation {#has_annotation tag="method"}
+
+Check whether the doc contains annotation on a token attribute.
+
+| Name | Description |
+| ------------------ | ----------- |
+| `attr` | The attribute string name or int ID. ~~Union[int, str]~~ |
+| _keyword-only_ | |
+| `require_complete` | Whether to check that the attribute is set on every token in the doc. Defaults to `False`. ~~bool~~ |
+| **RETURNS** | Whether specified annotation is present in the doc. ~~bool~~ |
+
 ## Doc.to_array {#to_array tag="method"}
 
 Export given token attributes to a numpy `ndarray`. If `attr_ids` is a sequence
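A quick usage sketch for the method documented in the hunk above; it assumes spaCy v3 and a blank English pipeline, so no annotation is set until we add it by hand.

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("Hello world")
print(doc.has_annotation("TAG"))                         # False: no tagger ran
print(doc.has_annotation("DEP"))                         # False: no parser ran

doc[0].tag_ = "NN"  # set a tag on only one of the two tokens
print(doc.has_annotation("TAG"))                         # True: some annotation present
print(doc.has_annotation("TAG", require_complete=True))  # False: not every token is tagged
```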
@@ -609,26 +620,22 @@ The L2 norm of the document's vector representation.
 
 ## Attributes {#attributes}
 
 | Name | Description |
 | ------------------------------------ | ----------- |
 | `text` | A string representation of the document text. ~~str~~ |
 | `text_with_ws` | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~ |
 | `mem` | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~ |
 | `vocab` | The store of lexical types. ~~Vocab~~ |
 | `tensor` <Tag variant="new">2</Tag> | Container for dense vector representations. ~~numpy.ndarray~~ |
 | `cats` <Tag variant="new">2</Tag> | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. ~~Dict[str, float]~~ |
 | `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ |
 | `lang` <Tag variant="new">2.1</Tag> | Language of the document's vocabulary. ~~int~~ |
 | `lang_` <Tag variant="new">2.1</Tag> | Language of the document's vocabulary. ~~str~~ |
-| `is_tagged` | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. ~~bool~~ |
-| `is_parsed` | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. ~~bool~~ |
-| `is_sentenced` | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. ~~bool~~ |
-| `is_nered` <Tag variant="new">2.1</Tag> | A flag indicating that named entities have been set. Will return `True` if the `Doc` is empty, or if _any_ of the tokens has an entity tag set, even if the others are unknown. ~~bool~~ |
 | `sentiment` | The document's positivity/negativity score, if available. ~~float~~ |
 | `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ |
 | `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ |
 | `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ |
 | `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
 
 ## Serialization fields {#serialization-fields}
 
@@ -17,7 +17,10 @@ return it.
 
 ## Language.\_\_init\_\_ {#init tag="method"}
 
-Initialize a `Language` object.
+Initialize a `Language` object. Note that the `meta` is only used for meta
+information in [`Language.meta`](/api/language#meta) and not to configure the
+`nlp` object or to override the config. To initialize from a config, use
+[`Language.from_config`](/api/language#from_config) instead.
 
 > #### Example
 >
@@ -37,7 +40,7 @@ Initialize a `Language` object.
 | `vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ |
 | _keyword-only_ | |
 | `max_length` | Maximum number of characters allowed in a single text. Defaults to `10 ** 6`. ~~int~~ |
-| `meta` | Custom meta data for the `Language` class. Is written to by pipelines to add meta data. ~~dict~~ |
+| `meta` | [Meta data](/api/data-formats#meta) overrides. ~~Dict[str, Any]~~ |
 | `create_tokenizer` | Optional function that receives the `nlp` object and returns a tokenizer. ~~Callable[[Language], Callable[[str], Doc]]~~ |
 
 ## Language.from_config {#from_config tag="classmethod" new="3"}
@@ -58,14 +61,17 @@ model under the hood based on its [`config.cfg`](/api/data-formats#config).
 > nlp = Language.from_config(config)
 > ```
 
 | Name | Description |
 | -------------- | ----------- |
 | `config` | The loaded config. ~~Union[Dict[str, Any], Config]~~ |
 | _keyword-only_ | |
-| `disable` | List of pipeline component names to disable. ~~Iterable[str]~~ |
-| `auto_fill` | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. ~~bool~~ |
-| `validate` | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
-| **RETURNS** | The initialized object. ~~Language~~ |
+| `vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ |
+| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ |
+| `exclude` | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
+| `meta` | [Meta data](/api/data-formats#meta) overrides. ~~Dict[str, Any]~~ |
+| `auto_fill` | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. ~~bool~~ |
+| `validate` | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
+| **RETURNS** | The initialized object. ~~Language~~ |
 
 ## Language.component {#component tag="classmethod" new="3"}
 
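A hedged sketch of `Language.from_config` with the keyword arguments from the updated table above. The minimal config (a blank English pipeline with a single sentencizer) is an assumption for illustration and relies on `auto_fill` to supply the remaining defaults.

```python
from spacy.language import Language

config = {
    "nlp": {"lang": "en", "pipeline": ["sentencizer"]},
    "components": {"sentencizer": {"factory": "sentencizer"}},
}
nlp = Language.from_config(
    config,
    disable=["sentencizer"],  # loaded, but not run until explicitly enabled
    meta={"description": "demo pipeline"},
    auto_fill=True,
    validate=True,
)
print(nlp.component_names)  # all loaded components, including disabled ones
print(nlp.disabled)         # components currently disabled
nlp.enable_pipe("sentencizer")
```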
@@ -797,10 +803,19 @@ token.ent_iob, token.ent_type
 
 ## Language.meta {#meta tag="property"}
 
-Custom meta data for the Language class. If a trained pipeline is loaded, this
+Meta data for the `Language` class, including name, version, data sources,
+license, author information and more. If a trained pipeline is loaded, this
 contains meta data of the pipeline. The `Language.meta` is also what's
-serialized as the [`meta.json`](/api/data-formats#meta) when you save an `nlp`
-object to disk.
+serialized as the `meta.json` when you save an `nlp` object to disk. See the
+[meta data format](/api/data-formats#meta) for more details.
+
+<Infobox variant="warning" title="Changed in v3.0">
+
+As of v3.0, the meta only contains **meta information** about the pipeline and
+isn't used to construct the language class and pipeline components. This
+information is expressed in the [`config.cfg`](/api/data-formats#config).
+
+</Infobox>
 
 > #### Example
 >
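To see what ends up in `Language.meta` in practice, a small check like the one below works; it assumes a trained pipeline such as `en_core_web_sm` is installed, since a blank pipeline only carries default values.

```python
import spacy

nlp = spacy.load("en_core_web_sm")
print(nlp.meta["lang"], nlp.meta["name"], nlp.meta["version"])
# The meta typically also includes description, author, license, sources, ...
print(sorted(nlp.meta.keys()))
```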
@@ -78,10 +78,14 @@ Create a blank pipeline of a given language class. This function is the twin of
 > nlp_de = spacy.blank("de") # equivalent to German()
 > ```
 
 | Name | Description |
 | ----------------------------------- | ----------- |
 | `name` | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. ~~str~~ |
+| _keyword-only_ | |
+| `vocab` <Tag variant="new">3</Tag> | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~. |
+| `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
+| `meta` <Tag variant="new">3</tag> | Optional meta overrides for [`nlp.meta`](/api/language#meta). ~~Dict[str, Any]~~ |
 | **RETURNS** | An empty `Language` object of the appropriate subclass. ~~Language~~ |
 
 ### spacy.info {#spacy.info tag="function"}
 
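A sketch of `spacy.blank` with the new keyword-only arguments from the table above; the `batch_size` override and the meta values are assumptions used purely for illustration, any valid `[nlp]` setting would do.

```python
import spacy

nlp = spacy.blank(
    "en",
    vocab=True,                            # create a fresh Vocab (the default)
    config={"nlp": {"batch_size": 128}},   # nested dict; dot notation also works
    meta={"description": "blank English pipeline"},
)
print(nlp.lang, nlp.meta.get("description"))
```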
@@ -744,14 +748,14 @@ and create a `Language` object. The model data will then be loaded in via
 > nlp = util.load_model("/path/to/data")
 > ```
 
 | Name | Description |
 | ------------------------------------ | ----------- |
 | `name` | Package name or path. ~~str~~ |
 | `vocab` <Tag variant="new">3</Tag> | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~. |
-| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ |
+| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ |
 | `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
 | `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
 | **RETURNS** | `Language` class with the loaded pipeline. ~~Language~~ |
 
 ### util.load_model_from_init_py {#util.load_model_from_init_py tag="function" new="2"}
 
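Since `util.load_model` backs `spacy.load`, the `disable`/`exclude` semantics in the table are easiest to see from the top-level API. The sketch below assumes `en_core_web_sm` is installed and that it ships the component names used here.

```python
import spacy

nlp = spacy.load("en_core_web_sm", disable=["ner"], exclude=["lemmatizer"])
print(nlp.pipe_names)   # "ner" is loaded but disabled; "lemmatizer" was never loaded
print(nlp.disabled)     # components that are loaded but won't run
nlp.enable_pipe("ner")  # switch the disabled component back on without reloading
```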
@@ -170,27 +170,52 @@ Compared to regular install via pip, the
 developer dependencies such as Cython. See the [quickstart widget](#quickstart)
 to get the right commands for your platform and Python version.
 
-#### Ubuntu {#source-ubuntu}
-
-Install system-level dependencies via `apt-get`:
+<a id="source-ubuntu"></a><a id="source-osx"></a><a id="source-windows"></a>
+
+- **Ubuntu:** Install system-level dependencies via `apt-get`:
+  `sudo apt-get install build-essential python-dev git`
+- **macOS / OS X:** Install a recent version of
+  [XCode](https://developer.apple.com/xcode/), including the so-called "Command
+  Line Tools". macOS and OS X ship with Python and git preinstalled.
+- **Windows:** Install a version of the
+  [Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/)
+  or
+  [Visual Studio Express](https://www.visualstudio.com/vs/visual-studio-express/)
+  that matches the version that was used to compile your Python interpreter.
+
+### Building an executable {#executable}
+
+The spaCy repository includes a [`Makefile`](%%GITHUB_SPACY/Makefile) that
+builds an executable zip file using [`pex`](https://github.com/pantsbuild/pex)
+(**P**ython **Ex**ecutable). The executable includes spaCy and all its package
+dependencies and only requires the system Python at runtime. Building an
+executable `.pex` file is often the most convenient way to deploy spaCy, as it
+lets you separate the build from the deployment process.
+
+> #### Usage
+>
+> To use a `.pex` file, just replace `python` with the path to the file when you
+> execute your code or CLI commands. This is equivalent to running Python in a
+> virtual environment with spaCy installed.
+>
+> ```bash
+> $ ./spacy.pex my_script.py
+> $ ./spacy.pex -m spacy info
+> ```
 
 ```bash
-$ sudo apt-get install build-essential python-dev git
+$ git clone https://github.com/explosion/spaCy
+$ cd spaCy
+$ make
 ```
 
-#### macOS / OS X {#source-osx}
-
-Install a recent version of [XCode](https://developer.apple.com/xcode/),
-including the so-called "Command Line Tools". macOS and OS X ship with Python
-and git preinstalled.
-
-#### Windows {#source-windows}
-
-Install a version of the
-[Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/)
-or
-[Visual Studio Express](https://www.visualstudio.com/vs/visual-studio-express/)
-that matches the version that was used to compile your Python interpreter.
+You can configure the build process with the following environment variables:
+
+| Variable | Description |
+| -------------- | ----------- |
+| `SPACY_EXTRAS` | Additional Python packages to install alongside spaCy with optional version specifications. Should be a string that can be passed to `pip install`. See [`Makefile`](%%GITHUB_SPACY/Makefile) for defaults. |
+| `PYVER` | The Python version to build against. This version needs to be available on your build and runtime machines. Defaults to `3.6`. |
+| `WHEELHOUSE` | Directory to store the wheel files during compilation. Defaults to `./wheelhouse`. |
 
 ### Run tests {#run-tests}
 
@@ -346,6 +346,8 @@ A pattern added to the dependency matcher consists of a **list of
 dictionaries**, with each dictionary describing a **token to match** and its
 **relation to an existing token** in the pattern.
 
+[Image: diagram illustrating a dependency matcher pattern]
+
 <Infobox title="Details & Documentation" emoji="📖" list>
 
 - **Usage:**
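The image placeholder above illustrates the pattern format described in the surrounding text. A hedged, runnable example of that format follows; it assumes `en_core_web_sm` is installed, and the sentence, match label and attribute values are illustrative.

```python
import spacy
from spacy.matcher import DependencyMatcher

nlp = spacy.load("en_core_web_sm")
matcher = DependencyMatcher(nlp.vocab)

pattern = [
    # anchor token: the verb "founded"
    {"RIGHT_ID": "founded", "RIGHT_ATTRS": {"ORTH": "founded"}},
    # a subject that is a direct syntactic child of the anchor
    {
        "LEFT_ID": "founded",
        "REL_OP": ">",
        "RIGHT_ID": "subject",
        "RIGHT_ATTRS": {"DEP": "nsubj"},
    },
]
matcher.add("FOUNDED", [pattern])

doc = nlp("Smith founded a healthcare company in 2005.")
for match_id, token_ids in matcher(doc):
    print([doc[i].text for i in token_ids])
```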
@@ -408,6 +410,7 @@ The following methods, attributes and commands are new in spaCy v3.0.
 | ----------- | ----------- |
 | [`Token.lex`](/api/token#attributes) | Access a token's [`Lexeme`](/api/lexeme). |
 | [`Token.morph`](/api/token#attributes), [`Token.morph_`](/api/token#attributes) | Access a token's morphological analysis. |
+| [`Doc.has_annotation`](/api/doc#has_annotation) | Check whether a doc has annotation on a token attribute. |
 | [`Language.select_pipes`](/api/language#select_pipes) | Context manager for enabling or disabling specific pipeline components for a block. |
 | [`Language.disable_pipe`](/api/language#disable_pipe), [`Language.enable_pipe`](/api/language#enable_pipe) | Disable or enable a loaded pipeline component (but don't remove it). |
 | [`Language.analyze_pipes`](/api/language#analyze_pipes) | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies. |
@@ -538,6 +541,9 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
   doesn't provide lemmas by default or switch automatically between lookup and
   rule-based lemmas. You can now add it to your pipeline explicitly and set its
   mode on initialization.
+- Various keyword arguments across functions and methods are now explicitly
+  declared as _keyword-only_ arguments. Those arguments are documented
+  accordingly across the API reference.
 
 ### Removed or renamed API {#incompat-removed}
 
@@ -704,6 +710,48 @@ nlp = spacy.blank("en")
 + nlp.add_pipe("ner", source=source_nlp)
 ```
 
+#### Configuring pipeline components with settings {#migrating-configure-pipe}
+
+Because pipeline components are now added using their string names, you won't
+have to instantiate the [component classes](/api/#architecture-pipeline)
+directly anymore. To configure the component, you can now use the `config`
+argument on [`nlp.add_pipe`](/api/language#add_pipe).
+
+> #### config.cfg (excerpt)
+>
+> ```ini
+> [components.sentencizer]
+> factory = "sentencizer"
+> punct_chars = ["!", ".", "?"]
+> ```
+
+```diff
+punct_chars = ["!", ".", "?"]
+- sentencizer = Sentencizer(punct_chars=punct_chars)
++ sentencizer = nlp.add_pipe("sentencizer", config={"punct_chars": punct_chars})
+```
+
+The `config` corresponds to the component settings in the
+[`config.cfg`](/usage/training#config-components) and will overwrite the default
+config defined by the components.
+
+<Infobox variant="warning" title="Important note on config values">
+
+Config values you pass to components **need to be JSON-serializable** and can't
+be arbitrary Python objects. Otherwise, the settings you provide can't be
+represented in the `config.cfg` and spaCy has no way of knowing how to re-create
+your component with the same settings when you load the pipeline back in. If you
+need to pass arbitrary objects to a component, use a
+[registered function](/usage/processing-pipelines#example-stateful-components):
+
+```diff
+- config = {"model": MyTaggerModel()}
++ config = {"model": {"@architectures": "MyTaggerModel"}}
+tagger = nlp.add_pipe("tagger", config=config)
+```
+
+</Infobox>
+
 ### Adding match patterns {#migrating-matcher}
 
 The [`Matcher.add`](/api/matcher#add),
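The migration shown in the hunk above can be exercised end to end in a few lines; this sketch assumes spaCy v3 and uses the built-in `sentencizer` factory from the example.

```python
import spacy

nlp = spacy.blank("en")
# Configure the component through nlp.add_pipe instead of instantiating its class
nlp.add_pipe("sentencizer", config={"punct_chars": ["!", ".", "?"]})

doc = nlp("First sentence! Second sentence. Third?")
print([sent.text for sent in doc.sents])
```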
@@ -758,6 +806,25 @@ nlp = spacy.blank("en")
 + ruler.load_from_tag_map(YOUR_TAG_MAP)
 ```
 
+### Migrating Doc flags {#migrating-doc-flags}
+
+The `Doc` flags `Doc.is_tagged`, `Doc.is_parsed`, `Doc.is_nered` and
+`Doc.is_sentenced` are deprecated in v3 and replaced by the
+[`Doc.has_annotation`](/api/doc#has_annotation) method, which refers to the
+token attribute symbols (the same symbols used in `Matcher` patterns):
+
+```diff
+doc = nlp(text)
+- doc.is_parsed
++ doc.has_annotation("DEP")
+- doc.is_tagged
++ doc.has_annotation("TAG")
+- doc.is_sentenced
++ doc.has_annotation("SENT_START")
+- doc.is_nered
++ doc.has_annotation("ENT_IOB")
+```
+
 ### Training pipelines and models {#migrating-training}
 
 To train your pipelines, you should now pretty much always use the