commit c1c841940c
Merge branch 'develop-proof' of https://github.com/walterhenry/spaCy into develop-proof
@@ -224,7 +224,7 @@ for that particular code. Here's an example:
 ```python
 # fmt: off
 text = "I look forward to using Thingamajig. I've been told it will make my life easier..."
-heads = [1, 0, -1, -2, -1, -1, -5, -1, 3, 2, 1, 0, 2, 1, -3, 1, 1, -3, -7]
+heads = [1, 1, 1, 1, 3, 4, 1, 6, 11, 11, 11, 11, 14, 14, 11, 16, 17, 14, 11]
 deps = ["nsubj", "ROOT", "advmod", "prep", "pcomp", "dobj", "punct", "",
         "nsubjpass", "aux", "auxpass", "ROOT", "nsubj", "aux", "ccomp",
         "poss", "nsubj", "ccomp", "punct"]
@@ -421,7 +421,7 @@ Tests that require the model to be loaded should be marked with
 `@pytest.mark.models`. Loading the models is expensive and not necessary if
 you're not actually testing the model performance. If all you need is a `Doc`
 object with annotations like heads, POS tags or the dependency parse, you can
-use the `get_doc()` utility function to construct it manually.
+use the `Doc` constructor to construct it manually.

 📖 **For more guidelines and information on how to add tests, check out the [tests README](spacy/tests/README.md).**
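For reference, a minimal sketch of what the updated guidance looks like in practice, constructing a `Doc` directly rather than via `get_doc()`. It assumes spaCy v3, where `heads` are absolute token indices (as in the changed example above), and uses a made-up sentence:

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
words = ["I", "like", "cats", "."]
heads = [1, 1, 1, 1]  # absolute index of each token's head ("like" is the root)
deps = ["nsubj", "ROOT", "dobj", "punct"]

# Build an annotated Doc without loading a trained pipeline
doc = Doc(nlp.vocab, words=words, heads=heads, deps=deps)
assert doc[0].head.text == "like"
```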
README.md
@@ -8,12 +8,12 @@ be used in real products.

 spaCy comes with
 [pretrained pipelines](https://spacy.io/models) and vectors, and
-currently supports tokenization for **59+ languages**. It features
+currently supports tokenization for **60+ languages**. It features
 state-of-the-art speed, convolutional **neural network models** for tagging,
 parsing, **named entity recognition**, **text classification** and more, multi-task learning with pretrained **transformers** like BERT, as well as a production-ready training system and easy model packaging, deployment and workflow management.
 spaCy is commercial open-source software, released under the MIT license.

-💫 **Version 2.3 out now!**
+💫 **Version 3.0 out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)

 [![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
@@ -29,16 +29,17 @@ spaCy is commercial open-source software, released under the MIT license.

 ## 📖 Documentation

-| Documentation   |                                                                 |
-| --------------- | --------------------------------------------------------------- |
-| [spaCy 101]     | New to spaCy? Here's everything you need to know!               |
-| [Usage Guides]  | How to use spaCy and its features.                              |
-| [New in v3.0]   | New features, backwards incompatibilities and migration guide.  |
-| [API Reference] | The detailed reference for spaCy's API.                         |
-| [Models]        | Download statistical language models for spaCy.                 |
-| [Universe]      | Libraries, extensions, demos, books and courses.                |
-| [Changelog]     | Changes and version history.                                    |
-| [Contribute]    | How to contribute to the spaCy project and code base.           |
+| Documentation       |                                                                 |
+| ------------------- | --------------------------------------------------------------- |
+| [spaCy 101]         | New to spaCy? Here's everything you need to know!               |
+| [Usage Guides]      | How to use spaCy and its features.                              |
+| [New in v3.0]       | New features, backwards incompatibilities and migration guide.  |
+| [Project Templates] | End-to-end workflows you can clone, modify and run.             |
+| [API Reference]     | The detailed reference for spaCy's API.                         |
+| [Models]            | Download statistical language models for spaCy.                 |
+| [Universe]          | Libraries, extensions, demos, books and courses.                |
+| [Changelog]         | Changes and version history.                                    |
+| [Contribute]        | How to contribute to the spaCy project and code base.           |

 [spacy 101]: https://spacy.io/usage/spacy-101
 [new in v3.0]: https://spacy.io/usage/v3
@@ -46,6 +47,7 @@ spaCy is commercial open-source software, released under the MIT license.
 [api reference]: https://spacy.io/api/
 [models]: https://spacy.io/models
 [universe]: https://spacy.io/universe
+[project templates]: https://github.com/explosion/projects
 [changelog]: https://spacy.io/usage#changelog
 [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md

@@ -69,7 +71,7 @@ it.

 ## Features

-- Support for **59+ languages**
+- Support for **60+ languages**
 - **Trained pipelines**
 - Multi-task learning with pretrained **transformers** like BERT
 - Pretrained **word vectors**
@ -1,133 +0,0 @@
|
|||
[paths]
|
||||
train = ""
|
||||
dev = ""
|
||||
raw = null
|
||||
init_tok2vec = null
|
||||
|
||||
[system]
|
||||
seed = 0
|
||||
use_pytorch_for_gpu_memory = false
|
||||
|
||||
[training]
|
||||
seed = ${system:seed}
|
||||
dropout = 0.1
|
||||
init_tok2vec = ${paths:init_tok2vec}
|
||||
vectors = null
|
||||
accumulate_gradient = 1
|
||||
max_steps = 0
|
||||
max_epochs = 0
|
||||
patience = 10000
|
||||
eval_frequency = 200
|
||||
score_weights = {"dep_las": 0.4, "ents_f": 0.4, "tag_acc": 0.2}
|
||||
frozen_components = []
|
||||
|
||||
[training.train_corpus]
|
||||
@readers = "spacy.Corpus.v1"
|
||||
path = ${paths:train}
|
||||
gold_preproc = true
|
||||
max_length = 0
|
||||
limit = 0
|
||||
|
||||
[training.dev_corpus]
|
||||
@readers = "spacy.Corpus.v1"
|
||||
path = ${paths:dev}
|
||||
gold_preproc = ${training.read_train:gold_preproc}
|
||||
max_length = 0
|
||||
limit = 0
|
||||
|
||||
[training.batcher]
|
||||
@batchers = "spacy.batch_by_words.v1"
|
||||
discard_oversize = false
|
||||
tolerance = 0.2
|
||||
|
||||
[training.batcher.size]
|
||||
@schedules = "compounding.v1"
|
||||
start = 100
|
||||
stop = 1000
|
||||
compound = 1.001
|
||||
|
||||
[training.optimizer]
|
||||
@optimizers = "Adam.v1"
|
||||
beta1 = 0.9
|
||||
beta2 = 0.999
|
||||
L2_is_weight_decay = true
|
||||
L2 = 0.01
|
||||
grad_clip = 1.0
|
||||
use_averages = false
|
||||
eps = 1e-8
|
||||
learn_rate = 0.001
|
||||
|
||||
[nlp]
|
||||
lang = "en"
|
||||
load_vocab_data = false
|
||||
pipeline = ["tok2vec", "ner", "tagger", "parser"]
|
||||
|
||||
[nlp.tokenizer]
|
||||
@tokenizers = "spacy.Tokenizer.v1"
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[components]
|
||||
|
||||
[components.tok2vec]
|
||||
factory = "tok2vec"
|
||||
|
||||
[components.ner]
|
||||
factory = "ner"
|
||||
learn_tokens = false
|
||||
min_action_freq = 1
|
||||
|
||||
[components.tagger]
|
||||
factory = "tagger"
|
||||
|
||||
[components.parser]
|
||||
factory = "parser"
|
||||
learn_tokens = false
|
||||
min_action_freq = 30
|
||||
|
||||
[components.tagger.model]
|
||||
@architectures = "spacy.Tagger.v1"
|
||||
|
||||
[components.tagger.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
width = ${components.tok2vec.model.encode:width}
|
||||
|
||||
[components.parser.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 8
|
||||
hidden_width = 128
|
||||
maxout_pieces = 2
|
||||
use_upper = true
|
||||
|
||||
[components.parser.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
width = ${components.tok2vec.model.encode:width}
|
||||
|
||||
[components.ner.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 3
|
||||
hidden_width = 128
|
||||
maxout_pieces = 2
|
||||
use_upper = true
|
||||
|
||||
[components.ner.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
width = ${components.tok2vec.model.encode:width}
|
||||
|
||||
[components.tok2vec.model]
|
||||
@architectures = "spacy.Tok2Vec.v1"
|
||||
|
||||
[components.tok2vec.model.embed]
|
||||
@architectures = "spacy.MultiHashEmbed.v1"
|
||||
width = ${components.tok2vec.model.encode:width}
|
||||
rows = 2000
|
||||
also_embed_subwords = true
|
||||
also_use_static_vectors = false
|
||||
|
||||
[components.tok2vec.model.encode]
|
||||
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
||||
width = 96
|
||||
depth = 4
|
||||
window_size = 1
|
||||
maxout_pieces = 3
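As an aside, a config like the deleted one above can be loaded and its variables resolved with the `util.load_config` helper used elsewhere in this commit. A minimal sketch, assuming the spaCy v3 nightly API and hypothetical paths:

```python
from spacy import util

# Load a training config like the one above; overrides use dotted names and
# interpolate=True resolves variable references such as ${paths:train}.
config = util.load_config(
    "config.cfg",  # hypothetical path to a config file with the contents above
    overrides={"paths.train": "./train.spacy", "paths.dev": "./dev.spacy"},
    interpolate=True,
)
print(config["training"]["dropout"])
```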
|
|
@ -1,152 +0,0 @@
|
|||
# Training hyper-parameters and additional features.
|
||||
[training]
|
||||
# Whether to train on sequences with 'gold standard' sentence boundaries
|
||||
# and tokens. If you set this to true, take care to ensure your run-time
|
||||
# data is passed in sentence-by-sentence via some prior preprocessing.
|
||||
gold_preproc = false
|
||||
# Limitations on training document length or number of examples.
|
||||
max_length = 0
|
||||
limit = 0
|
||||
# Data augmentation
|
||||
orth_variant_level = 0.0
|
||||
dropout = 0.1
|
||||
# Controls early-stopping. 0 or -1 mean unlimited.
|
||||
patience = 1600
|
||||
max_epochs = 0
|
||||
max_steps = 20000
|
||||
eval_frequency = 400
|
||||
# Other settings
|
||||
seed = 0
|
||||
accumulate_gradient = 1
|
||||
use_pytorch_for_gpu_memory = false
|
||||
# Control how scores are printed and checkpoints are evaluated.
|
||||
scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
|
||||
score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
|
||||
# These settings are invalid for the transformer models.
|
||||
init_tok2vec = null
|
||||
discard_oversize = false
|
||||
omit_extra_lookups = false
|
||||
batch_by = "words"
|
||||
use_gpu = -1
|
||||
raw_text = null
|
||||
tag_map = null
|
||||
|
||||
[training.batch_size]
|
||||
@schedules = "compounding.v1"
|
||||
start = 1000
|
||||
stop = 1000
|
||||
compound = 1.001
|
||||
|
||||
[training.optimizer]
|
||||
@optimizers = "Adam.v1"
|
||||
beta1 = 0.9
|
||||
beta2 = 0.999
|
||||
L2_is_weight_decay = true
|
||||
L2 = 0.01
|
||||
grad_clip = 1.0
|
||||
use_averages = true
|
||||
eps = 1e-8
|
||||
learn_rate = 0.001
|
||||
|
||||
[pretraining]
|
||||
max_epochs = 1000
|
||||
min_length = 5
|
||||
max_length = 500
|
||||
dropout = 0.2
|
||||
n_save_every = null
|
||||
batch_size = 3000
|
||||
seed = ${training:seed}
|
||||
use_pytorch_for_gpu_memory = ${training:use_pytorch_for_gpu_memory}
|
||||
tok2vec_model = "nlp.pipeline.tok2vec.model"
|
||||
|
||||
[pretraining.objective]
|
||||
type = "characters"
|
||||
n_characters = 4
|
||||
|
||||
[pretraining.optimizer]
|
||||
@optimizers = "Adam.v1"
|
||||
beta1 = 0.9
|
||||
beta2 = 0.999
|
||||
L2_is_weight_decay = true
|
||||
L2 = 0.01
|
||||
grad_clip = 1.0
|
||||
use_averages = true
|
||||
eps = 1e-8
|
||||
learn_rate = 0.001
|
||||
|
||||
[nlp]
|
||||
lang = "en"
|
||||
vectors = null
|
||||
base_model = null
|
||||
|
||||
[nlp.pipeline]
|
||||
|
||||
[nlp.pipeline.tok2vec]
|
||||
factory = "tok2vec"
|
||||
|
||||
[nlp.pipeline.senter]
|
||||
factory = "senter"
|
||||
|
||||
[nlp.pipeline.ner]
|
||||
factory = "ner"
|
||||
learn_tokens = false
|
||||
min_action_freq = 1
|
||||
beam_width = 1
|
||||
beam_update_prob = 1.0
|
||||
|
||||
[nlp.pipeline.tagger]
|
||||
factory = "tagger"
|
||||
|
||||
[nlp.pipeline.parser]
|
||||
factory = "parser"
|
||||
learn_tokens = false
|
||||
min_action_freq = 1
|
||||
beam_width = 1
|
||||
beam_update_prob = 1.0
|
||||
|
||||
[nlp.pipeline.senter.model]
|
||||
@architectures = "spacy.Tagger.v1"
|
||||
|
||||
[nlp.pipeline.senter.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model:width}
|
||||
|
||||
[nlp.pipeline.tagger.model]
|
||||
@architectures = "spacy.Tagger.v1"
|
||||
|
||||
[nlp.pipeline.tagger.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model:width}
|
||||
|
||||
[nlp.pipeline.parser.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 8
|
||||
hidden_width = 128
|
||||
maxout_pieces = 3
|
||||
use_upper = false
|
||||
|
||||
[nlp.pipeline.parser.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model:width}
|
||||
|
||||
[nlp.pipeline.ner.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 3
|
||||
hidden_width = 128
|
||||
maxout_pieces = 3
|
||||
use_upper = false
|
||||
|
||||
[nlp.pipeline.ner.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model:width}
|
||||
|
||||
[nlp.pipeline.tok2vec.model]
|
||||
@architectures = "spacy.HashEmbedCNN.v1"
|
||||
pretrained_vectors = ${nlp:vectors}
|
||||
width = 256
|
||||
depth = 6
|
||||
window_size = 1
|
||||
embed_size = 10000
|
||||
maxout_pieces = 3
|
||||
subword_features = true
|
||||
dropout = null
|
|
@ -1,73 +0,0 @@
|
|||
# Training hyper-parameters and additional features.
|
||||
[training]
|
||||
# Whether to train on sequences with 'gold standard' sentence boundaries
|
||||
# and tokens. If you set this to true, take care to ensure your run-time
|
||||
# data is passed in sentence-by-sentence via some prior preprocessing.
|
||||
gold_preproc = false
|
||||
# Limitations on training document length or number of examples.
|
||||
max_length = 3000
|
||||
limit = 0
|
||||
# Data augmentation
|
||||
orth_variant_level = 0.0
|
||||
dropout = 0.1
|
||||
# Controls early-stopping. 0 or -1 mean unlimited.
|
||||
patience = 100000
|
||||
max_epochs = 0
|
||||
max_steps = 0
|
||||
eval_frequency = 1000
|
||||
# Other settings
|
||||
seed = 0
|
||||
accumulate_gradient = 1
|
||||
use_pytorch_for_gpu_memory = false
|
||||
# Control how scores are printed and checkpoints are evaluated.
|
||||
scores = ["speed", "ents_p", "ents_r", "ents_f"]
|
||||
score_weights = {"ents_f": 1.0}
|
||||
# These settings are invalid for the transformer models.
|
||||
init_tok2vec = null
|
||||
discard_oversize = false
|
||||
omit_extra_lookups = false
|
||||
batch_by = "words"
|
||||
|
||||
[training.batch_size]
|
||||
@schedules = "compounding.v1"
|
||||
start = 100
|
||||
stop = 1000
|
||||
compound = 1.001
|
||||
|
||||
[training.optimizer]
|
||||
@optimizers = "Adam.v1"
|
||||
beta1 = 0.9
|
||||
beta2 = 0.999
|
||||
L2_is_weight_decay = true
|
||||
L2 = 0.01
|
||||
grad_clip = 1.0
|
||||
use_averages = true
|
||||
eps = 1e-8
|
||||
learn_rate = 0.001
|
||||
|
||||
[nlp]
|
||||
lang = "en"
|
||||
vectors = null
|
||||
|
||||
[nlp.pipeline.ner]
|
||||
factory = "ner"
|
||||
learn_tokens = false
|
||||
min_action_freq = 1
|
||||
|
||||
[nlp.pipeline.ner.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 3
|
||||
hidden_width = 64
|
||||
maxout_pieces = 2
|
||||
use_upper = true
|
||||
|
||||
[nlp.pipeline.ner.model.tok2vec]
|
||||
@architectures = "spacy.HashEmbedCNN.v1"
|
||||
pretrained_vectors = ${nlp:vectors}
|
||||
width = 96
|
||||
depth = 4
|
||||
window_size = 1
|
||||
embed_size = 2000
|
||||
maxout_pieces = 3
|
||||
subword_features = true
|
||||
dropout = ${training:dropout}
|
|
@ -1,73 +0,0 @@
|
|||
[training]
|
||||
patience = 10000
|
||||
eval_frequency = 200
|
||||
dropout = 0.2
|
||||
init_tok2vec = null
|
||||
vectors = null
|
||||
max_epochs = 100
|
||||
orth_variant_level = 0.0
|
||||
gold_preproc = true
|
||||
max_length = 0
|
||||
use_gpu = 0
|
||||
scores = ["tags_acc", "uas", "las"]
|
||||
score_weights = {"las": 0.8, "tags_acc": 0.2}
|
||||
limit = 0
|
||||
seed = 0
|
||||
accumulate_gradient = 2
|
||||
discard_oversize = false
|
||||
|
||||
[training.batch_size]
|
||||
@schedules = "compounding.v1"
|
||||
start = 100
|
||||
stop = 1000
|
||||
compound = 1.001
|
||||
|
||||
[training.optimizer]
|
||||
@optimizers = "Adam.v1"
|
||||
learn_rate = 0.001
|
||||
beta1 = 0.9
|
||||
beta2 = 0.999
|
||||
|
||||
[nlp]
|
||||
lang = "en"
|
||||
vectors = ${training:vectors}
|
||||
|
||||
[nlp.pipeline.tok2vec]
|
||||
factory = "tok2vec"
|
||||
|
||||
[nlp.pipeline.tagger]
|
||||
factory = "tagger"
|
||||
|
||||
[nlp.pipeline.parser]
|
||||
factory = "parser"
|
||||
learn_tokens = false
|
||||
min_action_freq = 1
|
||||
beam_width = 1
|
||||
beam_update_prob = 1.0
|
||||
|
||||
[nlp.pipeline.tagger.model]
|
||||
@architectures = "spacy.Tagger.v1"
|
||||
|
||||
[nlp.pipeline.tagger.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model:width}
|
||||
|
||||
[nlp.pipeline.parser.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 8
|
||||
hidden_width = 64
|
||||
maxout_pieces = 3
|
||||
|
||||
[nlp.pipeline.parser.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model:width}
|
||||
|
||||
[nlp.pipeline.tok2vec.model]
|
||||
@architectures = "spacy.HashEmbedBiLSTM.v1"
|
||||
pretrained_vectors = ${nlp:vectors}
|
||||
width = 96
|
||||
depth = 4
|
||||
embed_size = 2000
|
||||
subword_features = true
|
||||
maxout_pieces = 3
|
||||
dropout = null
|
|
@ -1,110 +0,0 @@
|
|||
[paths]
|
||||
train = ""
|
||||
dev = ""
|
||||
raw = null
|
||||
init_tok2vec = null
|
||||
|
||||
[system]
|
||||
seed = 0
|
||||
use_pytorch_for_gpu_memory = false
|
||||
|
||||
[training]
|
||||
seed = ${system:seed}
|
||||
dropout = 0.2
|
||||
init_tok2vec = ${paths:init_tok2vec}
|
||||
vectors = null
|
||||
accumulate_gradient = 1
|
||||
max_steps = 0
|
||||
max_epochs = 0
|
||||
patience = 10000
|
||||
eval_frequency = 200
|
||||
score_weights = {"dep_las": 0.8, "tag_acc": 0.2}
|
||||
|
||||
[training.read_train]
|
||||
@readers = "spacy.Corpus.v1"
|
||||
path = ${paths:train}
|
||||
gold_preproc = true
|
||||
max_length = 0
|
||||
limit = 0
|
||||
|
||||
[training.read_dev]
|
||||
@readers = "spacy.Corpus.v1"
|
||||
path = ${paths:dev}
|
||||
gold_preproc = ${training.read_train:gold_preproc}
|
||||
max_length = 0
|
||||
limit = 0
|
||||
|
||||
[training.batcher]
|
||||
@batchers = "spacy.batch_by_words.v1"
|
||||
discard_oversize = false
|
||||
tolerance = 0.2
|
||||
|
||||
[training.batcher.size]
|
||||
@schedules = "compounding.v1"
|
||||
start = 100
|
||||
stop = 1000
|
||||
compound = 1.001
|
||||
|
||||
[training.optimizer]
|
||||
@optimizers = "Adam.v1"
|
||||
learn_rate = 0.001
|
||||
beta1 = 0.9
|
||||
beta2 = 0.999
|
||||
|
||||
[nlp]
|
||||
lang = "en"
|
||||
pipeline = ["tok2vec", "tagger", "parser"]
|
||||
load_vocab_data = false
|
||||
|
||||
[nlp.tokenizer]
|
||||
@tokenizers = "spacy.Tokenizer.v1"
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[components]
|
||||
|
||||
[components.tok2vec]
|
||||
factory = "tok2vec"
|
||||
|
||||
[components.tagger]
|
||||
factory = "tagger"
|
||||
|
||||
[components.parser]
|
||||
factory = "parser"
|
||||
learn_tokens = false
|
||||
min_action_freq = 1
|
||||
|
||||
[components.tagger.model]
|
||||
@architectures = "spacy.Tagger.v1"
|
||||
|
||||
[components.tagger.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
width = ${components.tok2vec.model.encode:width}
|
||||
|
||||
[components.parser.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 8
|
||||
hidden_width = 64
|
||||
maxout_pieces = 3
|
||||
|
||||
[components.parser.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
width = ${components.tok2vec.model.encode:width}
|
||||
|
||||
[components.tok2vec.model]
|
||||
@architectures = "spacy.Tok2Vec.v1"
|
||||
|
||||
[components.tok2vec.model.embed]
|
||||
@architectures = "spacy.MultiHashEmbed.v1"
|
||||
width = ${components.tok2vec.model.encode:width}
|
||||
rows = 2000
|
||||
also_embed_subwords = true
|
||||
also_use_static_vectors = false
|
||||
|
||||
[components.tok2vec.model.encode]
|
||||
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
||||
width = 96
|
||||
depth = 4
|
||||
window_size = 1
|
||||
maxout_pieces = 3
|
|
@ -1,69 +0,0 @@
|
|||
[training]
|
||||
use_gpu = -1
|
||||
limit = 0
|
||||
dropout = 0.2
|
||||
patience = 10000
|
||||
eval_frequency = 200
|
||||
scores = ["ents_f"]
|
||||
score_weights = {"ents_f": 1}
|
||||
orth_variant_level = 0.0
|
||||
gold_preproc = true
|
||||
max_length = 0
|
||||
batch_size = 25
|
||||
seed = 0
|
||||
accumulate_gradient = 2
|
||||
discard_oversize = false
|
||||
|
||||
[training.optimizer]
|
||||
@optimizers = "Adam.v1"
|
||||
learn_rate = 0.001
|
||||
beta1 = 0.9
|
||||
beta2 = 0.999
|
||||
|
||||
[nlp]
|
||||
lang = "en"
|
||||
vectors = null
|
||||
|
||||
[nlp.pipeline.tok2vec]
|
||||
factory = "tok2vec"
|
||||
|
||||
[nlp.pipeline.tok2vec.model]
|
||||
@architectures = "spacy.Tok2Vec.v1"
|
||||
|
||||
[nlp.pipeline.tok2vec.model.extract]
|
||||
@architectures = "spacy.CharacterEmbed.v1"
|
||||
width = 96
|
||||
nM = 64
|
||||
nC = 8
|
||||
rows = 2000
|
||||
columns = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
|
||||
dropout = null
|
||||
|
||||
[nlp.pipeline.tok2vec.model.extract.features]
|
||||
@architectures = "spacy.Doc2Feats.v1"
|
||||
columns = ${nlp.pipeline.tok2vec.model.extract:columns}
|
||||
|
||||
[nlp.pipeline.tok2vec.model.embed]
|
||||
@architectures = "spacy.LayerNormalizedMaxout.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model.extract:width}
|
||||
maxout_pieces = 4
|
||||
|
||||
[nlp.pipeline.tok2vec.model.encode]
|
||||
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model.extract:width}
|
||||
window_size = 1
|
||||
maxout_pieces = 2
|
||||
depth = 2
|
||||
|
||||
[nlp.pipeline.ner]
|
||||
factory = "ner"
|
||||
|
||||
[nlp.pipeline.ner.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 6
|
||||
hidden_width = 64
|
||||
maxout_pieces = 2
|
||||
|
||||
[nlp.pipeline.ner.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model.extract:width}
|
|
@ -1,51 +0,0 @@
|
|||
[training]
|
||||
use_gpu = -1
|
||||
limit = 0
|
||||
dropout = 0.2
|
||||
patience = 10000
|
||||
eval_frequency = 200
|
||||
scores = ["ents_p", "ents_r", "ents_f"]
|
||||
score_weights = {"ents_f": 1}
|
||||
orth_variant_level = 0.0
|
||||
gold_preproc = true
|
||||
max_length = 0
|
||||
seed = 0
|
||||
accumulate_gradient = 2
|
||||
discard_oversize = false
|
||||
|
||||
[training.batch_size]
|
||||
@schedules = "compounding.v1"
|
||||
start = 3000
|
||||
stop = 3000
|
||||
compound = 1.001
|
||||
|
||||
|
||||
[training.optimizer]
|
||||
@optimizers = "Adam.v1"
|
||||
learn_rate = 0.001
|
||||
beta1 = 0.9
|
||||
beta2 = 0.999
|
||||
|
||||
[nlp]
|
||||
lang = "en"
|
||||
vectors = null
|
||||
|
||||
[nlp.pipeline.ner]
|
||||
factory = "ner"
|
||||
|
||||
[nlp.pipeline.ner.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 6
|
||||
hidden_width = 64
|
||||
maxout_pieces = 2
|
||||
|
||||
[nlp.pipeline.ner.model.tok2vec]
|
||||
@architectures = "spacy.HashEmbedCNN.v1"
|
||||
width = 128
|
||||
depth = 4
|
||||
embed_size = 7000
|
||||
maxout_pieces = 3
|
||||
window_size = 1
|
||||
subword_features = true
|
||||
pretrained_vectors = null
|
||||
dropout = null
|
|
@ -6,7 +6,7 @@ requires = [
|
|||
"cymem>=2.0.2,<2.1.0",
|
||||
"preshed>=3.0.2,<3.1.0",
|
||||
"murmurhash>=0.28.0,<1.1.0",
|
||||
"thinc>=8.0.0a31,<8.0.0a40",
|
||||
"thinc>=8.0.0a34,<8.0.0a40",
|
||||
"blis>=0.4.0,<0.5.0",
|
||||
"pytokenizations",
|
||||
"pathy"
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
# Our libraries
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc>=8.0.0a31,<8.0.0a40
|
||||
thinc>=8.0.0a34,<8.0.0a40
|
||||
blis>=0.4.0,<0.5.0
|
||||
ml_datasets>=0.1.1
|
||||
ml_datasets==0.2.0a0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
wasabi>=0.8.0,<1.1.0
|
||||
srsly>=2.1.0,<3.0.0
|
||||
|
@ -20,6 +20,7 @@ pytokenizations
|
|||
setuptools
|
||||
packaging
|
||||
importlib_metadata>=0.20; python_version < "3.8"
|
||||
typing_extensions>=3.7.4; python_version < "3.8"
|
||||
# Development dependencies
|
||||
cython>=0.25
|
||||
pytest>=4.6.5
|
||||
|
|
|
@ -34,13 +34,13 @@ setup_requires =
|
|||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
thinc>=8.0.0a31,<8.0.0a40
|
||||
thinc>=8.0.0a34,<8.0.0a40
|
||||
install_requires =
|
||||
# Our libraries
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc>=8.0.0a31,<8.0.0a40
|
||||
thinc>=8.0.0a34,<8.0.0a40
|
||||
blis>=0.4.0,<0.5.0
|
||||
wasabi>=0.8.0,<1.1.0
|
||||
srsly>=2.1.0,<3.0.0
|
||||
|
@ -57,6 +57,7 @@ install_requires =
|
|||
setuptools
|
||||
packaging
|
||||
importlib_metadata>=0.20; python_version < "3.8"
|
||||
typing_extensions>=3.7.4; python_version < "3.8"
|
||||
|
||||
[options.entry_points]
|
||||
console_scripts =
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# fmt: off
|
||||
__title__ = "spacy-nightly"
|
||||
__version__ = "3.0.0a18"
|
||||
__version__ = "3.0.0a23"
|
||||
__release__ = True
|
||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||
|
|
|
@ -6,15 +6,16 @@ from wasabi import msg
|
|||
import srsly
|
||||
import hashlib
|
||||
import typer
|
||||
import subprocess
|
||||
from click import NoSuchOption
|
||||
from click.parser import split_arg_string
|
||||
from typer.main import get_command
|
||||
from contextlib import contextmanager
|
||||
from thinc.config import Config, ConfigValidationError
|
||||
from configparser import InterpolationError
|
||||
import os
|
||||
|
||||
from ..schemas import ProjectConfigSchema, validate
|
||||
from ..util import import_file, run_command, make_tempdir, registry
|
||||
from ..util import import_file, run_command, make_tempdir, registry, logger
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathy import Pathy # noqa: F401
|
||||
|
@ -38,6 +39,7 @@ commands to check and validate your config files, training and evaluation data,
|
|||
and custom model implementations.
|
||||
"""
|
||||
INIT_HELP = """Commands for initializing configs and pipeline packages."""
|
||||
OVERRIDES_ENV_VAR = "SPACY_CONFIG_OVERRIDES"
|
||||
|
||||
# Wrappers for Typer's annotations. Initially created to set defaults and to
|
||||
# keep the names short, but not needed at the moment.
|
||||
|
@ -62,24 +64,41 @@ def setup_cli() -> None:
|
|||
command(prog_name=COMMAND)
|
||||
|
||||
|
||||
def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
|
||||
def parse_config_overrides(
|
||||
args: List[str], env_var: Optional[str] = OVERRIDES_ENV_VAR
|
||||
) -> Dict[str, Any]:
|
||||
"""Generate a dictionary of config overrides based on the extra arguments
|
||||
provided on the CLI, e.g. --training.batch_size to override
|
||||
"training.batch_size". Arguments without a "." are considered invalid,
|
||||
since the config only allows top-level sections to exist.
|
||||
|
||||
args (List[str]): The extra arguments from the command line.
|
||||
env_vars (Optional[str]): Optional environment variable to read from.
|
||||
RETURNS (Dict[str, Any]): The parsed dict, keyed by nested config setting.
|
||||
"""
|
||||
env_string = os.environ.get(env_var, "") if env_var else ""
|
||||
env_overrides = _parse_overrides(split_arg_string(env_string))
|
||||
cli_overrides = _parse_overrides(args, is_cli=True)
|
||||
if cli_overrides:
|
||||
keys = [k for k in cli_overrides if k not in env_overrides]
|
||||
logger.debug(f"Config overrides from CLI: {keys}")
|
||||
if env_overrides:
|
||||
logger.debug(f"Config overrides from env variables: {list(env_overrides)}")
|
||||
return {**cli_overrides, **env_overrides}
|
||||
|
||||
|
||||
def _parse_overrides(args: List[str], is_cli: bool = False) -> Dict[str, Any]:
|
||||
result = {}
|
||||
while args:
|
||||
opt = args.pop(0)
|
||||
err = f"Invalid CLI argument '{opt}'"
|
||||
err = f"Invalid config override '{opt}'"
|
||||
if opt.startswith("--"): # new argument
|
||||
orig_opt = opt
|
||||
opt = opt.replace("--", "")
|
||||
if "." not in opt:
|
||||
raise NoSuchOption(orig_opt)
|
||||
if is_cli:
|
||||
raise NoSuchOption(orig_opt)
|
||||
else:
|
||||
msg.fail(f"{err}: can't override top-level sections", exits=1)
|
||||
if "=" in opt: # we have --opt=value
|
||||
opt, value = opt.split("=", 1)
|
||||
opt = opt.replace("-", "_")
|
||||
|
@ -98,7 +117,7 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
|
|||
except ValueError:
|
||||
result[opt] = str(value)
|
||||
else:
|
||||
msg.fail(f"{err}: override option should start with --", exits=1)
|
||||
msg.fail(f"{err}: name should start with --", exits=1)
|
||||
return result
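To make the intended behavior concrete, an illustrative sketch of the override parsing above (not part of the diff; the argument values are made up, and the module path is assumed):

```python
from spacy.cli._util import parse_config_overrides  # module path assumed

# Dotted CLI arguments map to nested config settings; values that parse as
# JSON are converted to their typed form, everything else stays a string.
overrides = parse_config_overrides(
    ["--training.batch_size", "128", "--paths.train=./train.spacy"]
)
# Expected: {"training.batch_size": 128, "paths.train": "./train.spacy"}
# Overrides can also be picked up from the SPACY_CONFIG_OVERRIDES environment
# variable, which is read and merged in as shown above.
```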
|
||||
|
||||
|
||||
|
@ -287,7 +306,7 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False)
|
|||
if dest.exists() and not force:
|
||||
return None
|
||||
src = str(src)
|
||||
with smart_open.open(src, mode="rb") as input_file:
|
||||
with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
|
||||
with dest.open(mode="wb") as output_file:
|
||||
output_file.write(input_file.read())
|
||||
|
||||
|
@ -308,6 +327,31 @@ def git_checkout(
|
|||
msg.fail("Destination of checkout must not exist", exits=1)
|
||||
if not dest.parent.exists():
|
||||
raise IOError("Parent of destination of checkout must exist")
|
||||
|
||||
if sparse and git_version >= (2, 22):
|
||||
return git_sparse_checkout(repo, subpath, dest, branch)
|
||||
elif sparse:
|
||||
# Only show warnings if the user explicitly wants sparse checkout but
|
||||
# the Git version doesn't support it
|
||||
err_old = (
|
||||
f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
|
||||
f"that doesn't fully support sparse checkout yet."
|
||||
)
|
||||
err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled."
|
||||
msg.warn(
|
||||
f"{err_unk if git_version == (0, 0) else err_old} "
|
||||
f"This means that more files than necessary may be downloaded "
|
||||
f"temporarily. To only download the files needed, make sure "
|
||||
f"you're using Git v2.22 or above."
|
||||
)
|
||||
with make_tempdir() as tmp_dir:
|
||||
cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}"
|
||||
run_command(cmd, capture=True)
|
||||
# We need Path(name) to make sure we also support subdirectories
|
||||
shutil.copytree(str(tmp_dir / Path(subpath)), str(dest))
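A hedged usage sketch for the checkout helper above; the repository subpath and destination are made up and the module path is assumed:

```python
from pathlib import Path
from spacy.cli._util import git_checkout  # module path assumed

# Copy only "some/subdir" of a remote repo into a local folder. With
# sparse=False, or an old Git version, the full shallow-clone fallback above
# runs instead of sparse checkout. The parent folder ("assets") must exist.
git_checkout(
    "https://github.com/explosion/projects",
    "some/subdir",                 # hypothetical path inside the repo
    Path("assets/some_subdir"),    # destination; must not exist yet
    branch="master",
    sparse=False,
)
```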
|
||||
|
||||
|
||||
def git_sparse_checkout(repo, subpath, dest, branch):
|
||||
# We're using Git, partial clone and sparse checkout to
|
||||
# only clone the files we need
|
||||
# This ends up being RIDICULOUS. omg.
|
||||
|
@ -324,47 +368,31 @@ def git_checkout(
|
|||
# *that* we can do by path.
|
||||
# We're using Git and sparse checkout to only clone the files we need
|
||||
with make_tempdir() as tmp_dir:
|
||||
supports_sparse = git_version >= (2, 22)
|
||||
use_sparse = supports_sparse and sparse
|
||||
# This is the "clone, but don't download anything" part.
|
||||
cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " f"-b {branch} "
|
||||
if use_sparse:
|
||||
cmd += f"--filter=blob:none" # <-- The key bit
|
||||
# Only show warnings if the user explicitly wants sparse checkout but
|
||||
# the Git version doesn't support it
|
||||
elif sparse:
|
||||
err_old = (
|
||||
f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
|
||||
f"that doesn't fully support sparse checkout yet."
|
||||
)
|
||||
err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled."
|
||||
msg.warn(
|
||||
f"{err_unk if git_version == (0, 0) else err_old} "
|
||||
f"This means that more files than necessary may be downloaded "
|
||||
f"temporarily. To only download the files needed, make sure "
|
||||
f"you're using Git v2.22 or above."
|
||||
)
|
||||
try_run_command(cmd)
|
||||
cmd = (
|
||||
f"git clone {repo} {tmp_dir} --no-checkout --depth 1 "
|
||||
f"-b {branch} --filter=blob:none"
|
||||
)
|
||||
run_command(cmd)
|
||||
# Now we need to find the missing filenames for the subpath we want.
|
||||
# Looking for this 'rev-list' command in the git --help? Hah.
|
||||
cmd = f"git -C {tmp_dir} rev-list --objects --all {'--missing=print ' if use_sparse else ''} -- {subpath}"
|
||||
ret = try_run_command(cmd)
|
||||
git_repo = _from_http_to_git(repo)
|
||||
cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
|
||||
ret = run_command(cmd, capture=True)
|
||||
git_repo = _http_to_git(repo)
|
||||
# Now pass those missings into another bit of git internals
|
||||
missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
|
||||
if use_sparse and not missings:
|
||||
if not missings:
|
||||
err = (
|
||||
f"Could not find any relevant files for '{subpath}'. "
|
||||
f"Did you specify a correct and complete path within repo '{repo}' "
|
||||
f"and branch {branch}?"
|
||||
)
|
||||
msg.fail(err, exits=1)
|
||||
if use_sparse:
|
||||
cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}"
|
||||
try_run_command(cmd)
|
||||
cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}"
|
||||
run_command(cmd, capture=True)
|
||||
# And finally, we can checkout our subpath
|
||||
cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
|
||||
try_run_command(cmd)
|
||||
run_command(cmd, capture=True)
|
||||
# We need Path(name) to make sure we also support subdirectories
|
||||
shutil.move(str(tmp_dir / Path(subpath)), str(dest))
|
||||
|
||||
|
@ -378,7 +406,7 @@ def get_git_version(
|
|||
RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns
|
||||
(0, 0) if the version couldn't be determined.
|
||||
"""
|
||||
ret = try_run_command(["git", "--version"], error=error)
|
||||
ret = run_command("git --version", capture=True)
|
||||
stdout = ret.stdout.strip()
|
||||
if not stdout or not stdout.startswith("git version"):
|
||||
return (0, 0)
|
||||
|
@ -386,24 +414,7 @@ def get_git_version(
|
|||
return (int(version[0]), int(version[1]))
|
||||
|
||||
|
||||
def try_run_command(
|
||||
cmd: Union[str, List[str]], error: str = "Could not run command"
|
||||
) -> subprocess.CompletedProcess:
|
||||
"""Try running a command and raise an error if it fails.
|
||||
|
||||
cmd (Union[str, List[str]]): The command to run.
|
||||
error (str): The error message.
|
||||
RETURNS (CompletedProcess): The completed process if the command ran.
|
||||
"""
|
||||
try:
|
||||
return run_command(cmd, capture=True)
|
||||
except subprocess.CalledProcessError as e:
|
||||
msg.fail(error)
|
||||
print(cmd)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def _from_http_to_git(repo: str) -> str:
|
||||
def _http_to_git(repo: str) -> str:
|
||||
if repo.startswith("http://"):
|
||||
repo = repo.replace(r"http://", r"https://")
|
||||
if repo.startswith(r"https://"):
|
||||
|
|
|
@ -9,7 +9,7 @@ import sys
|
|||
from ._util import app, Arg, Opt
|
||||
from ..training import docs_to_json
|
||||
from ..tokens import DocBin
|
||||
from ..training.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs
|
||||
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs, conllu_to_docs
|
||||
|
||||
|
||||
# Converters are matched by file extension except for ner/iob, which are
|
||||
|
@ -18,12 +18,12 @@ from ..training.converters import iob2docs, conll_ner2docs, json2docs, conllu2do
|
|||
# imported from /converters.
|
||||
|
||||
CONVERTERS = {
|
||||
"conllubio": conllu2docs,
|
||||
"conllu": conllu2docs,
|
||||
"conll": conllu2docs,
|
||||
"ner": conll_ner2docs,
|
||||
"iob": iob2docs,
|
||||
"json": json2docs,
|
||||
"conllubio": conllu_to_docs,
|
||||
"conllu": conllu_to_docs,
|
||||
"conll": conllu_to_docs,
|
||||
"ner": conll_ner_to_docs,
|
||||
"iob": iob_to_docs,
|
||||
"json": json_to_docs,
|
||||
}
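For orientation, a sketch of calling one of the renamed converters directly, assuming the new names above; the input path is hypothetical:

```python
from spacy.training.converters import conllu_to_docs  # renamed API as above

# Convert CoNLL-U annotations into spaCy Doc objects, the same way the
# convert command does when it matches the .conllu file extension.
with open("train.conllu", encoding="utf8") as f:
    docs = list(conllu_to_docs(f.read()))
print(len(docs), "docs converted")
```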
|
||||
|
||||
|
||||
|
|
|
@ -2,7 +2,7 @@ from typing import Optional, Dict, Any, Union, List
|
|||
from pathlib import Path
|
||||
from wasabi import msg, table
|
||||
from thinc.api import Config
|
||||
from thinc.config import VARIABLE_RE
|
||||
from thinc.config import VARIABLE_RE, ConfigValidationError
|
||||
import typer
|
||||
|
||||
from ._util import Arg, Opt, show_validation_error, parse_config_overrides
|
||||
|
@ -51,7 +51,10 @@ def debug_config(
|
|||
msg.divider("Config validation")
|
||||
with show_validation_error(config_path):
|
||||
config = util.load_config(config_path, overrides=overrides)
|
||||
nlp, _ = util.load_model_from_config(config)
|
||||
nlp, resolved = util.load_model_from_config(config)
|
||||
# Use the resolved config here in case user has one function returning
|
||||
# a dict of corpora etc.
|
||||
check_section_refs(resolved, ["training.dev_corpus", "training.train_corpus"])
|
||||
msg.good("Config is valid")
|
||||
if show_vars:
|
||||
variables = get_variables(config)
|
||||
|
@ -93,3 +96,23 @@ def get_variables(config: Config) -> Dict[str, Any]:
|
|||
value = util.dot_to_object(config, path)
|
||||
result[variable] = repr(value)
|
||||
return result
|
||||
|
||||
|
||||
def check_section_refs(config: Config, fields: List[str]) -> None:
|
||||
"""Validate fields in the config that refer to other sections or values
|
||||
(e.g. in the corpora) and make sure that those references exist.
|
||||
"""
|
||||
errors = []
|
||||
for field in fields:
|
||||
# If the field doesn't exist in the config, we ignore it
|
||||
try:
|
||||
value = util.dot_to_object(config, field)
|
||||
except KeyError:
|
||||
continue
|
||||
try:
|
||||
util.dot_to_object(config, value)
|
||||
except KeyError:
|
||||
msg = f"not a valid section reference: {value}"
|
||||
errors.append({"loc": field.split("."), "msg": msg})
|
||||
if errors:
|
||||
raise ConfigValidationError(config, errors)
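A small sketch of the dot-notation lookup that `check_section_refs` relies on, using `util.dot_to_object` on a tiny inline config; the section names are illustrative:

```python
from thinc.api import Config
from spacy import util

cfg = Config().from_str("""
[corpora]

[corpora.dev]
path = "./dev.spacy"

[training]
dev_corpus = "corpora.dev"
""")

# "training.dev_corpus" holds a string that must itself name a config section.
ref = util.dot_to_object(cfg, "training.dev_corpus")  # -> "corpora.dev"
section = util.dot_to_object(cfg, ref)  # raises KeyError if the section is missing
```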
|
||||
|
|
|
@ -1,8 +1,12 @@
|
|||
from typing import Dict, Any, Optional
|
||||
import warnings
|
||||
from typing import Dict, Any, Optional, Iterable
|
||||
from pathlib import Path
|
||||
|
||||
from spacy.training import Example
|
||||
from spacy.util import dot_to_object
|
||||
from wasabi import msg
|
||||
from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
|
||||
from thinc.api import Model, data_validation
|
||||
from thinc.api import Model, data_validation, set_gpu_allocator
|
||||
import typer
|
||||
|
||||
from ._util import Arg, Opt, debug_cli, show_validation_error
|
||||
|
@ -53,24 +57,30 @@ def debug_model_cli(
|
|||
}
|
||||
config_overrides = parse_config_overrides(ctx.args)
|
||||
with show_validation_error(config_path):
|
||||
config = util.load_config(config_path, overrides=config_overrides)
|
||||
nlp, config = util.load_model_from_config(config_path)
|
||||
config = util.load_config(
|
||||
config_path, overrides=config_overrides, interpolate=True
|
||||
)
|
||||
allocator = config["training"]["gpu_allocator"]
|
||||
if use_gpu >= 0 and allocator:
|
||||
set_gpu_allocator(allocator)
|
||||
nlp, config = util.load_model_from_config(config)
|
||||
seed = config["training"]["seed"]
|
||||
if seed is not None:
|
||||
msg.info(f"Fixing random seed: {seed}")
|
||||
fix_random_seed(seed)
|
||||
pipe = nlp.get_pipe(component)
|
||||
if hasattr(pipe, "model"):
|
||||
model = pipe.model
|
||||
else:
|
||||
if not hasattr(pipe, "model"):
|
||||
msg.fail(
|
||||
f"The component '{component}' does not specify an object that holds a Model.",
|
||||
exits=1,
|
||||
)
|
||||
debug_model(model, print_settings=print_settings)
|
||||
model = pipe.model
|
||||
debug_model(config, nlp, model, print_settings=print_settings)
|
||||
|
||||
|
||||
def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None):
|
||||
def debug_model(
|
||||
config, nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = None
|
||||
):
|
||||
if not isinstance(model, Model):
|
||||
msg.fail(
|
||||
f"Requires a Thinc Model to be analysed, but found {type(model)} instead.",
|
||||
|
@ -87,10 +97,23 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None
|
|||
|
||||
# STEP 1: Initializing the model and printing again
|
||||
X = _get_docs()
|
||||
Y = _get_output(model.ops.xp)
|
||||
# The output vector might differ from the official type of the output layer
|
||||
with data_validation(False):
|
||||
model.initialize(X=X, Y=Y)
|
||||
try:
|
||||
train_corpus = dot_to_object(config, config["training"]["train_corpus"])
|
||||
nlp.begin_training(lambda: train_corpus(nlp))
|
||||
msg.info("Initialized the model with the training corpus.")
|
||||
except ValueError:
|
||||
try:
|
||||
_set_output_dim(nO=7, model=model)
|
||||
nlp.begin_training(lambda: [Example.from_dict(x, {}) for x in X])
|
||||
msg.info("Initialized the model with dummy data.")
|
||||
except:
|
||||
msg.fail(
|
||||
"Could not initialize the model: you'll have to provide a valid train_corpus argument in the config file.",
|
||||
exits=1,
|
||||
)
|
||||
|
||||
if print_settings.get("print_after_init"):
|
||||
msg.divider(f"STEP 1 - after initialization")
|
||||
_print_model(model, print_settings)
|
||||
|
@ -98,9 +121,18 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None
|
|||
# STEP 2: Updating the model and printing again
|
||||
optimizer = Adam(0.001)
|
||||
set_dropout_rate(model, 0.2)
|
||||
# ugly hack to deal with Tok2Vec listeners
|
||||
tok2vec = None
|
||||
if model.has_ref("tok2vec") and model.get_ref("tok2vec").name == "tok2vec-listener":
|
||||
tok2vec = nlp.get_pipe("tok2vec")
|
||||
goldY = None
|
||||
for e in range(3):
|
||||
Y, get_dX = model.begin_update(_get_docs())
|
||||
dY = get_gradient(model, Y)
|
||||
if tok2vec:
|
||||
tok2vec.update([Example.from_dict(x, {}) for x in X])
|
||||
Y, get_dX = model.begin_update(X)
|
||||
if goldY is None:
|
||||
goldY = _simulate_gold(Y)
|
||||
dY = get_gradient(goldY, Y, model.ops)
|
||||
get_dX(dY)
|
||||
model.finish_update(optimizer)
|
||||
if print_settings.get("print_after_training"):
|
||||
|
@ -108,15 +140,25 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None
|
|||
_print_model(model, print_settings)
|
||||
|
||||
# STEP 3: the final prediction
|
||||
prediction = model.predict(_get_docs())
|
||||
prediction = model.predict(X)
|
||||
if print_settings.get("print_prediction"):
|
||||
msg.divider(f"STEP 3 - prediction")
|
||||
msg.info(str(prediction))
|
||||
|
||||
msg.good(f"Succesfully ended analysis - model looks good.")
|
||||
|
||||
def get_gradient(model, Y):
|
||||
goldY = _get_output(model.ops.xp)
|
||||
return Y - goldY
|
||||
|
||||
def get_gradient(goldY, Y, ops):
|
||||
return ops.asarray(Y) - ops.asarray(goldY)
|
||||
|
||||
|
||||
def _simulate_gold(element, counter=1):
|
||||
if isinstance(element, Iterable):
|
||||
for i in range(len(element)):
|
||||
element[i] = _simulate_gold(element[i], counter + i)
|
||||
return element
|
||||
else:
|
||||
return 1 / counter
|
||||
|
||||
|
||||
def _sentences():
|
||||
|
@ -133,8 +175,13 @@ def _get_docs(lang: str = "en"):
|
|||
return list(nlp.pipe(_sentences()))
|
||||
|
||||
|
||||
def _get_output(xp):
|
||||
return xp.asarray([i + 10 for i, _ in enumerate(_get_docs())], dtype="float32")
|
||||
def _set_output_dim(model, nO):
|
||||
# simulating dim inference by directly setting the nO argument of the model
|
||||
if model.has_dim("nO") is None:
|
||||
model.set_dim("nO", nO)
|
||||
if model.has_ref("output_layer"):
|
||||
if model.get_ref("output_layer").has_dim("nO") is None:
|
||||
model.get_ref("output_layer").set_dim("nO", nO)
|
||||
|
||||
|
||||
def _print_model(model, print_settings):
|
||||
|
|
|
@ -30,12 +30,13 @@ def init_config_cli(
|
|||
pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
|
||||
optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
||||
cpu: bool = Opt(False, "--cpu", "-C", help="Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
||||
pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Generate a starter config.cfg for training. Based on your requirements
|
||||
specified via the CLI arguments, this command generates a config with the
|
||||
optimal settings for you use case. This includes the choice of architecture,
|
||||
optimal settings for your use case. This includes the choice of architecture,
|
||||
pretrained weights and related hyperparameters.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/cli#init-config
|
||||
|
@ -43,7 +44,14 @@ def init_config_cli(
|
|||
if isinstance(optimize, Optimizations): # instance of enum from the CLI
|
||||
optimize = optimize.value
|
||||
pipeline = string_to_list(pipeline)
|
||||
init_config(output_file, lang=lang, pipeline=pipeline, optimize=optimize, cpu=cpu)
|
||||
init_config(
|
||||
output_file,
|
||||
lang=lang,
|
||||
pipeline=pipeline,
|
||||
optimize=optimize,
|
||||
cpu=cpu,
|
||||
pretraining=pretraining,
|
||||
)
|
||||
|
||||
|
||||
@init_cli.command("fill-config")
|
||||
|
@ -51,7 +59,7 @@ def init_fill_config_cli(
|
|||
# fmt: off
|
||||
base_path: Path = Arg(..., help="Base config to fill", exists=True, dir_okay=False),
|
||||
output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True),
|
||||
pretraining: bool = Opt(False, "--pretraining", "-p", help="Include config for pretraining (with 'spacy pretrain')"),
|
||||
pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
|
||||
diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes")
|
||||
# fmt: on
|
||||
):
|
||||
|
@ -109,7 +117,13 @@ def fill_config(
|
|||
|
||||
|
||||
def init_config(
|
||||
output_file: Path, *, lang: str, pipeline: List[str], optimize: str, cpu: bool
|
||||
output_file: Path,
|
||||
*,
|
||||
lang: str,
|
||||
pipeline: List[str],
|
||||
optimize: str,
|
||||
cpu: bool,
|
||||
pretraining: bool = False,
|
||||
) -> None:
|
||||
is_stdout = str(output_file) == "-"
|
||||
msg = Printer(no_print=is_stdout)
|
||||
|
@ -156,8 +170,13 @@ def init_config(
|
|||
with show_validation_error(hint_fill=False):
|
||||
config = util.load_config_from_str(base_template)
|
||||
nlp, _ = util.load_model_from_config(config, auto_fill=True)
|
||||
config = nlp.config
|
||||
if pretraining:
|
||||
validate_config_for_pretrain(config, msg)
|
||||
pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
|
||||
config = pretrain_config.merge(config)
|
||||
msg.good("Auto-filled config with all values")
|
||||
save_config(nlp.config, output_file, is_stdout=is_stdout)
|
||||
save_config(config, output_file, is_stdout=is_stdout)
|
||||
|
||||
|
||||
def save_config(
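A usage sketch for the extended `init_config` signature above (the output path is hypothetical); this is what the new `--pretraining` flag feeds into:

```python
from pathlib import Path
from spacy.cli.init_config import init_config  # module path assumed

# Generate a CPU-optimized starter config for an English tagger/parser/NER
# pipeline and merge in the default [pretraining] block.
init_config(
    Path("config.cfg"),
    lang="en",
    pipeline=["tagger", "parser", "ner"],
    optimize="efficiency",
    cpu=True,
    pretraining=True,
)
```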
|
||||
|
|
|
@ -110,7 +110,7 @@ def package(
|
|||
msg.good(f"Successfully created package '{model_name_v}'", main_path)
|
||||
if create_sdist:
|
||||
with util.working_dir(main_path):
|
||||
util.run_command([sys.executable, "setup.py", "sdist"])
|
||||
util.run_command([sys.executable, "setup.py", "sdist"], capture=False)
|
||||
zip_file = main_path / "dist" / f"{model_name_v}.tar.gz"
|
||||
msg.good(f"Successfully created zipped Python package", zip_file)
|
||||
|
||||
|
|
|
@ -4,10 +4,9 @@ import time
|
|||
import re
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
from thinc.api import Config
|
||||
from thinc.api import use_pytorch_for_gpu_memory, require_gpu
|
||||
from thinc.api import require_gpu, set_gpu_allocator
|
||||
from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
|
||||
from thinc.api import CosineDistance, L2Distance
|
||||
from thinc.api import Config, CosineDistance, L2Distance
|
||||
from wasabi import msg
|
||||
import srsly
|
||||
from functools import partial
|
||||
|
@ -20,6 +19,7 @@ from ..ml.models.multi_task import build_cloze_characters_multi_task_model
|
|||
from ..tokens import Doc
|
||||
from ..attrs import ID
|
||||
from .. import util
|
||||
from ..util import dot_to_object
|
||||
|
||||
|
||||
@app.command(
|
||||
|
@ -31,7 +31,7 @@ def pretrain_cli(
|
|||
ctx: typer.Context, # This is only used to read additional arguments
|
||||
config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
|
||||
output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
|
||||
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
|
||||
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
|
||||
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
|
||||
|
@ -70,9 +70,7 @@ def pretrain_cli(
|
|||
|
||||
with show_validation_error(config_path):
|
||||
config = util.load_config(
|
||||
config_path,
|
||||
overrides=config_overrides,
|
||||
interpolate=True
|
||||
config_path, overrides=config_overrides, interpolate=True
|
||||
)
|
||||
if not config.get("pretraining"):
|
||||
# TODO: What's the solution here? How do we handle optional blocks?
|
||||
|
@ -83,7 +81,7 @@ def pretrain_cli(
|
|||
|
||||
config.to_disk(output_dir / "config.cfg")
|
||||
msg.good("Saved config file in the output directory")
|
||||
|
||||
|
||||
pretrain(
|
||||
config,
|
||||
output_dir,
|
||||
|
@ -98,15 +96,17 @@ def pretrain(
|
|||
output_dir: Path,
|
||||
resume_path: Optional[Path] = None,
|
||||
epoch_resume: Optional[int] = None,
|
||||
use_gpu: int=-1
|
||||
use_gpu: int = -1,
|
||||
):
|
||||
if config["system"].get("seed") is not None:
|
||||
fix_random_seed(config["system"]["seed"])
|
||||
if use_gpu >= 0 and config["system"].get("use_pytorch_for_gpu_memory"):
|
||||
use_pytorch_for_gpu_memory()
|
||||
if config["training"]["seed"] is not None:
|
||||
fix_random_seed(config["training"]["seed"])
|
||||
allocator = config["training"]["gpu_allocator"]
|
||||
if use_gpu >= 0 and allocator:
|
||||
set_gpu_allocator(allocator)
|
||||
|
||||
nlp, config = util.load_model_from_config(config)
|
||||
P_cfg = config["pretraining"]
|
||||
corpus = P_cfg["corpus"]
|
||||
corpus = dot_to_object(config, P_cfg["corpus"])
|
||||
batcher = P_cfg["batcher"]
|
||||
model = create_pretraining_model(nlp, config["pretraining"])
|
||||
optimizer = config["pretraining"]["optimizer"]
|
||||
|
@ -147,9 +147,7 @@ def pretrain(
|
|||
progress = tracker.update(epoch, loss, docs)
|
||||
if progress:
|
||||
msg.row(progress, **row_settings)
|
||||
if P_cfg["n_save_every"] and (
|
||||
batch_id % P_cfg["n_save_every"] == 0
|
||||
):
|
||||
if P_cfg["n_save_every"] and (batch_id % P_cfg["n_save_every"] == 0):
|
||||
_save_model(epoch, is_temp=True)
|
||||
_save_model(epoch)
|
||||
tracker.epoch_loss = 0.0
|
||||
|
|
|
@ -66,6 +66,7 @@ def project_assets(project_dir: Path, *, sparse_checkout: bool = False) -> None:
|
|||
branch=asset["git"].get("branch"),
|
||||
sparse=sparse_checkout,
|
||||
)
|
||||
msg.good(f"Downloaded asset {dest}")
|
||||
else:
|
||||
url = asset.get("url")
|
||||
if not url:
|
||||
|
|
|
@ -27,19 +27,32 @@ def project_pull_cli(
|
|||
|
||||
|
||||
def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
|
||||
# TODO: We don't have tests for this :(. It would take a bit of mockery to
|
||||
# set up. I guess see if it breaks first?
|
||||
config = load_project_config(project_dir)
|
||||
if remote in config.get("remotes", {}):
|
||||
remote = config["remotes"][remote]
|
||||
storage = RemoteStorage(project_dir, remote)
|
||||
for cmd in config.get("commands", []):
|
||||
deps = [project_dir / dep for dep in cmd.get("deps", [])]
|
||||
if any(not dep.exists() for dep in deps):
|
||||
continue
|
||||
cmd_hash = get_command_hash("", "", deps, cmd["script"])
|
||||
for output_path in cmd.get("outputs", []):
|
||||
url = storage.pull(output_path, command_hash=cmd_hash)
|
||||
yield url, output_path
|
||||
commands = list(config.get("commands", []))
|
||||
# We use a while loop here because we don't know how the commands
|
||||
# will be ordered. A command might need dependencies from one that's later
|
||||
# in the list.
|
||||
while commands:
|
||||
for i, cmd in enumerate(list(commands)):
|
||||
deps = [project_dir / dep for dep in cmd.get("deps", [])]
|
||||
if all(dep.exists() for dep in deps):
|
||||
cmd_hash = get_command_hash("", "", deps, cmd["script"])
|
||||
for output_path in cmd.get("outputs", []):
|
||||
url = storage.pull(output_path, command_hash=cmd_hash)
|
||||
yield url, output_path
|
||||
|
||||
out_locs = [project_dir / out for out in cmd.get("outputs", [])]
|
||||
if all(loc.exists() for loc in out_locs):
|
||||
update_lockfile(project_dir, cmd)
|
||||
out_locs = [project_dir / out for out in cmd.get("outputs", [])]
|
||||
if all(loc.exists() for loc in out_locs):
|
||||
update_lockfile(project_dir, cmd)
|
||||
# We remove the command from the list here, and break, so that
|
||||
# we iterate over the loop again.
|
||||
commands.remove(i)
|
||||
break
|
||||
else:
|
||||
# If we didn't break the for loop, break the while loop.
|
||||
break
|
||||
|
|
|
@ -59,8 +59,9 @@ def project_run(
|
|||
for dep in cmd.get("deps", []):
|
||||
if not (project_dir / dep).exists():
|
||||
err = f"Missing dependency specified by command '{subcommand}': {dep}"
|
||||
err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
|
||||
err_kwargs = {"exits": 1} if not dry else {}
|
||||
msg.fail(err, **err_kwargs)
|
||||
msg.fail(err, err_help, **err_kwargs)
|
||||
with working_dir(project_dir) as current_dir:
|
||||
rerun = check_rerun(current_dir, cmd)
|
||||
if not rerun and not force:
|
||||
|
@ -144,7 +145,7 @@ def run_commands(
|
|||
if not silent:
|
||||
print(f"Running command: {join_command(command)}")
|
||||
if not dry:
|
||||
run_command(command)
|
||||
run_command(command, capture=False)
|
||||
|
||||
|
||||
def validate_subcommand(
|
||||
|
|
|
@ -8,7 +8,11 @@ train = ""
|
|||
dev = ""
|
||||
|
||||
[system]
|
||||
use_pytorch_for_gpu_memory = {{ "true" if use_transformer else "false" }}
|
||||
{% if use_transformer -%}
|
||||
gpu_allocator = "pytorch"
|
||||
{% else -%}
|
||||
gpu_allocator = null
|
||||
{% endif %}
|
||||
|
||||
[nlp]
|
||||
lang = "{{ lang }}"
|
||||
|
@ -55,7 +59,8 @@ factory = "parser"
|
|||
|
||||
[components.parser.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 8
|
||||
state_type = "parser"
|
||||
extra_state_tokens = false
|
||||
hidden_width = 128
|
||||
maxout_pieces = 3
|
||||
use_upper = false
|
||||
|
@ -75,7 +80,8 @@ factory = "ner"
|
|||
|
||||
[components.ner.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 3
|
||||
state_type = "ner"
|
||||
extra_state_tokens = false
|
||||
hidden_width = 64
|
||||
maxout_pieces = 2
|
||||
use_upper = false
|
||||
|
@ -89,6 +95,49 @@ grad_factor = 1.0
|
|||
@layers = "reduce_mean.v1"
|
||||
{% endif -%}
|
||||
|
||||
{% if "entity_linker" in components -%}
|
||||
[components.entity_linker]
|
||||
factory = "entity_linker"
|
||||
get_candidates = {"@misc":"spacy.CandidateGenerator.v1"}
|
||||
incl_context = true
|
||||
incl_prior = true
|
||||
|
||||
[components.entity_linker.model]
|
||||
@architectures = "spacy.EntityLinker.v1"
|
||||
nO = null
|
||||
|
||||
[components.entity_linker.model.tok2vec]
|
||||
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||
grad_factor = 1.0
|
||||
|
||||
[components.entity_linker.model.tok2vec.pooling]
|
||||
@layers = "reduce_mean.v1"
|
||||
{% endif -%}
|
||||
|
||||
{% if "textcat" in components %}
|
||||
[components.textcat]
|
||||
factory = "textcat"
|
||||
|
||||
{% if optimize == "accuracy" %}
|
||||
[components.textcat.model]
|
||||
@architectures = "spacy.TextCatEnsemble.v1"
|
||||
exclusive_classes = false
|
||||
width = 64
|
||||
conv_depth = 2
|
||||
embed_size = 2000
|
||||
window_size = 1
|
||||
ngram_size = 1
|
||||
nO = null
|
||||
|
||||
{% else -%}
|
||||
[components.textcat.model]
|
||||
@architectures = "spacy.TextCatBOW.v1"
|
||||
exclusive_classes = false
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
|
||||
{# NON-TRANSFORMER PIPELINE #}
|
||||
{% else -%}
|
||||
|
||||
|
@ -136,7 +185,8 @@ factory = "parser"
|
|||
|
||||
[components.parser.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 8
|
||||
state_type = "parser"
|
||||
extra_state_tokens = false
|
||||
hidden_width = 128
|
||||
maxout_pieces = 3
|
||||
use_upper = true
|
||||
|
@ -153,7 +203,8 @@ factory = "ner"
|
|||
|
||||
[components.ner.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 6
|
||||
state_type = "ner"
|
||||
extra_state_tokens = false
|
||||
hidden_width = 64
|
||||
maxout_pieces = 2
|
||||
use_upper = true
|
||||
|
@ -163,16 +214,68 @@ nO = null
|
|||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
width = ${components.tok2vec.model.encode.width}
|
||||
{% endif %}
|
||||
|
||||
{% if "entity_linker" in components -%}
|
||||
[components.entity_linker]
|
||||
factory = "entity_linker"
|
||||
get_candidates = {"@misc":"spacy.CandidateGenerator.v1"}
|
||||
incl_context = true
|
||||
incl_prior = true
|
||||
|
||||
[components.entity_linker.model]
|
||||
@architectures = "spacy.EntityLinker.v1"
|
||||
nO = null
|
||||
|
||||
[components.entity_linker.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
width = ${components.tok2vec.model.encode.width}
|
||||
{% endif %}
|
||||
|
||||
{% if "textcat" in components %}
|
||||
[components.textcat]
|
||||
factory = "textcat"
|
||||
|
||||
{% if optimize == "accuracy" %}
|
||||
[components.textcat.model]
|
||||
@architectures = "spacy.TextCatEnsemble.v1"
|
||||
exclusive_classes = false
|
||||
width = 64
|
||||
conv_depth = 2
|
||||
embed_size = 2000
|
||||
window_size = 1
|
||||
ngram_size = 1
|
||||
nO = null
|
||||
|
||||
{% else -%}
|
||||
[components.textcat.model]
|
||||
@architectures = "spacy.TextCatBOW.v1"
|
||||
exclusive_classes = false
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{% endif %}
|
||||
|
||||
{% for pipe in components %}
|
||||
{% if pipe not in ["tagger", "parser", "ner"] %}
|
||||
{% if pipe not in ["tagger", "parser", "ner", "textcat", "entity_linker"] %}
|
||||
{# Other components defined by the user: we just assume they're factories #}
|
||||
[components.{{ pipe }}]
|
||||
factory = "{{ pipe }}"
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
|
||||
[corpora]
|
||||
|
||||
[corpora.train]
|
||||
@readers = "spacy.Corpus.v1"
|
||||
path = ${paths.train}
|
||||
max_length = {{ 500 if hardware == "gpu" else 2000 }}
|
||||
|
||||
[corpora.dev]
|
||||
@readers = "spacy.Corpus.v1"
|
||||
path = ${paths.dev}
|
||||
max_length = 0
|
||||
|
||||
[training]
|
||||
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
|
||||
vectors = null
|
||||
|
@ -181,12 +284,13 @@ vectors = "{{ word_vectors }}"
|
|||
{% endif -%}
|
||||
{% if use_transformer -%}
|
||||
accumulate_gradient = {{ transformer["size_factor"] }}
|
||||
{% endif %}
|
||||
{% endif -%}
|
||||
dev_corpus = "corpora.dev"
|
||||
train_corpus = "corpora.train"
|
||||
|
||||
[training.optimizer]
|
||||
@optimizers = "Adam.v1"
|
||||
|
||||
|
||||
{% if use_transformer -%}
|
||||
[training.optimizer.learn_rate]
|
||||
@schedules = "warmup_linear.v1"
|
||||
|
@ -195,16 +299,6 @@ total_steps = 20000
|
|||
initial_rate = 5e-5
|
||||
{% endif %}
|
||||
|
||||
[training.train_corpus]
|
||||
@readers = "spacy.Corpus.v1"
|
||||
path = ${paths.train}
|
||||
max_length = {{ 500 if hardware == "gpu" else 2000 }}
|
||||
|
||||
[training.dev_corpus]
|
||||
@readers = "spacy.Corpus.v1"
|
||||
path = ${paths.dev}
|
||||
max_length = 0
|
||||
|
||||
{% if use_transformer %}
|
||||
[training.batcher]
|
||||
@batchers = "spacy.batch_by_padded.v1"
|
||||
|
@@ -223,18 +317,3 @@ start = 100
stop = 1000
compound = 1.001
{% endif %}

[training.score_weights]
{%- if "tagger" in components %}
tag_acc = {{ (1.0 / components|length)|round(2) }}
{%- endif -%}
{%- if "parser" in components %}
dep_uas = 0.0
dep_las = {{ (1.0 / components|length)|round(2) }}
sents_f = 0.0
{%- endif %}
{%- if "ner" in components %}
ents_f = {{ (1.0 / components|length)|round(2) }}
ents_p = 0.0
ents_r = 0.0
{%- endif -%}
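The `[training.score_weights]` block above splits the final score evenly across the trainable components, giving each `1.0 / components|length` rounded to two decimals; metrics weighted `0.0` are reported but do not contribute. A quick check of that arithmetic in plain Python (the component names are just an example):

```python
# Sketch of the per-component weight arithmetic used in the template above.
components = ["tagger", "parser", "ner"]  # illustrative pipeline

weight = round(1.0 / len(components), 2)
print(weight)  # 0.33

# e.g. tag_acc, dep_las and ents_f each get 0.33, while dep_uas, sents_f,
# ents_p and ents_r are weighted 0.0 and only reported.
score_weights = {"tag_acc": weight, "dep_las": weight, "ents_f": weight,
                 "dep_uas": 0.0, "sents_f": 0.0, "ents_p": 0.0, "ents_r": 0.0}
print(round(sum(score_weights.values()), 2))  # 0.99
```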
@@ -6,8 +6,7 @@ from pathlib import Path
from wasabi import msg
import thinc
import thinc.schedules
from thinc.api import use_pytorch_for_gpu_memory, require_gpu, fix_random_seed
from thinc.api import Config, Optimizer
from thinc.api import Config, Optimizer, require_gpu, fix_random_seed, set_gpu_allocator
import random
import typer
import logging

@@ -18,6 +17,7 @@ from ..language import Language
from .. import util
from ..training.example import Example
from ..errors import Errors
from ..util import dot_to_object


@app.command(

@@ -28,7 +28,7 @@ def train_cli(
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True),
output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
resume: bool = Opt(False, "--resume", "-R", help="Resume training"),
@@ -78,22 +78,23 @@ def train(
config = util.load_config(
config_path, overrides=config_overrides, interpolate=True
)
if config.get("training", {}).get("seed") is not None:
if config["training"]["seed"] is not None:
fix_random_seed(config["training"]["seed"])
if config.get("system", {}).get("use_pytorch_for_gpu_memory"):
# It feels kind of weird to not have a default for this.
use_pytorch_for_gpu_memory()
allocator = config["training"]["gpu_allocator"]
if use_gpu >= 0 and allocator:
set_gpu_allocator(allocator)
# Use original config here before it's resolved to functions
sourced_components = get_sourced_components(config)
with show_validation_error(config_path):
nlp, config = util.load_model_from_config(config)
util.load_vocab_data_into_model(nlp, lookups=config["training"]["lookups"])
if config["training"]["vectors"] is not None:
util.load_vectors_into_model(nlp, config["training"]["vectors"])
raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
T_cfg = config["training"]
optimizer = T_cfg["optimizer"]
train_corpus = T_cfg["train_corpus"]
dev_corpus = T_cfg["dev_corpus"]
train_corpus = dot_to_object(config, T_cfg["train_corpus"])
dev_corpus = dot_to_object(config, T_cfg["dev_corpus"])
batcher = T_cfg["batcher"]
train_logger = T_cfg["logger"]
# Components that shouldn't be updated during training
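With the corpora moved into a top-level `[corpora]` block, `[training].train_corpus` and `[training].dev_corpus` now hold dotted references such as `"corpora.train"`, and `dot_to_object` resolves that reference against the loaded config. A small sketch of the idea on a plain nested dict (this re-implementation is for illustration only, not spaCy's helper):

```python
# Minimal re-implementation of dotted-path lookup, for illustration only.
def dot_to_object(config: dict, path: str):
    obj = config
    for key in path.split("."):
        obj = obj[key]
    return obj

config = {
    "corpora": {"train": "<train reader>", "dev": "<dev reader>"},
    "training": {"train_corpus": "corpora.train", "dev_corpus": "corpora.dev"},
}
train_corpus = dot_to_object(config, config["training"]["train_corpus"])
print(train_corpus)  # <train reader>
```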
@ -151,7 +152,8 @@ def train(
|
|||
exclude=frozen_components,
|
||||
)
|
||||
msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
|
||||
print_row, finalize_logger = train_logger(nlp)
|
||||
with nlp.select_pipes(disable=frozen_components):
|
||||
print_row, finalize_logger = train_logger(nlp)
|
||||
|
||||
try:
|
||||
progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)
|
||||
|
@ -162,7 +164,8 @@ def train(
|
|||
progress.close()
|
||||
print_row(info)
|
||||
if is_best_checkpoint and output_path is not None:
|
||||
update_meta(T_cfg, nlp, info)
|
||||
with nlp.select_pipes(disable=frozen_components):
|
||||
update_meta(T_cfg, nlp, info)
|
||||
with nlp.use_params(optimizer.averages):
|
||||
nlp.to_disk(output_path / "model-best")
|
||||
progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)
|
||||
|
@ -206,10 +209,17 @@ def create_train_batches(iterator, batcher, max_epochs: int):
|
|||
def create_evaluation_callback(
|
||||
nlp: Language, dev_corpus: Callable, weights: Dict[str, float]
|
||||
) -> Callable[[], Tuple[float, Dict[str, float]]]:
|
||||
weights = {key: value for key, value in weights.items() if value is not None}
|
||||
|
||||
def evaluate() -> Tuple[float, Dict[str, float]]:
|
||||
dev_examples = list(dev_corpus(nlp))
|
||||
scores = nlp.evaluate(dev_examples)
|
||||
# Calculate a weighted sum based on score_weights for the main score
|
||||
# Calculate a weighted sum based on score_weights for the main score.
|
||||
# We can only consider scores that are ints/floats, not dicts like
|
||||
# entity scores per type etc.
|
||||
for key, value in scores.items():
|
||||
if key in weights and not isinstance(value, (int, float)):
|
||||
raise ValueError(Errors.E915.format(name=key, score_type=type(value)))
|
||||
try:
|
||||
weighted_score = sum(
|
||||
scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights
|
||||
|
@ -365,7 +375,8 @@ def update_meta(
|
|||
) -> None:
|
||||
nlp.meta["performance"] = {}
|
||||
for metric in training["score_weights"]:
|
||||
nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
|
||||
if metric is not None:
|
||||
nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
|
||||
for pipe_name in nlp.pipe_names:
|
||||
nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
|
||||
|
||||
|
|
|
@@ -22,6 +22,11 @@ try:
except ImportError:
cupy = None

try: # Python 3.8+
from typing import Literal
except ImportError:
from typing_extensions import Literal # noqa: F401

from thinc.api import Optimizer # noqa: F401

pickle = pickle
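`Literal` is part of `typing` from Python 3.8 and comes from `typing_extensions` on older versions, which is why the compat shim above is needed; elsewhere in this diff it restricts `state_type` to `"parser"` or `"ner"`. A standalone sketch of the same pattern (the example function is illustrative, not spaCy code):

```python
# Sketch of the Literal fallback pattern from spacy/compat.py above.
try:  # Python 3.8+
    from typing import Literal
except ImportError:  # older Pythons: pip install typing_extensions
    from typing_extensions import Literal


def describe(state_type: Literal["parser", "ner"]) -> str:
    # Type checkers will flag any value other than "parser" or "ner".
    return f"building a {state_type} model"


print(describe("ner"))  # building a ner model
```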
@ -6,13 +6,12 @@ init_tok2vec = null
|
|||
|
||||
[system]
|
||||
seed = 0
|
||||
use_pytorch_for_gpu_memory = false
|
||||
gpu_allocator = null
|
||||
|
||||
[nlp]
|
||||
lang = null
|
||||
pipeline = []
|
||||
disabled = []
|
||||
load_vocab_data = true
|
||||
before_creation = null
|
||||
after_creation = null
|
||||
after_pipeline_creation = null
|
||||
|
@ -22,29 +21,10 @@ after_pipeline_creation = null
|
|||
|
||||
[components]
|
||||
|
||||
# Training hyper-parameters and additional features.
|
||||
[training]
|
||||
seed = ${system.seed}
|
||||
dropout = 0.1
|
||||
accumulate_gradient = 1
|
||||
# Extra resources for transfer-learning or pseudo-rehearsal
|
||||
init_tok2vec = ${paths.init_tok2vec}
|
||||
raw_text = ${paths.raw}
|
||||
vectors = null
|
||||
# Controls early-stopping. 0 or -1 mean unlimited.
|
||||
patience = 1600
|
||||
max_epochs = 0
|
||||
max_steps = 20000
|
||||
eval_frequency = 200
|
||||
# Control how scores are printed and checkpoints are evaluated.
|
||||
score_weights = {}
|
||||
# Names of pipeline components that shouldn't be updated during training
|
||||
frozen_components = []
|
||||
# Readers for corpora like dev and train.
|
||||
[corpora]
|
||||
|
||||
[training.logger]
|
||||
@loggers = "spacy.ConsoleLogger.v1"
|
||||
|
||||
[training.train_corpus]
|
||||
[corpora.train]
|
||||
@readers = "spacy.Corpus.v1"
|
||||
path = ${paths.train}
|
||||
# Whether to train on sequences with 'gold standard' sentence boundaries
|
||||
|
@ -56,7 +36,7 @@ max_length = 0
|
|||
# Limitation on number of training examples
|
||||
limit = 0
|
||||
|
||||
[training.dev_corpus]
|
||||
[corpora.dev]
|
||||
@readers = "spacy.Corpus.v1"
|
||||
path = ${paths.dev}
|
||||
# Whether to train on sequences with 'gold standard' sentence boundaries
|
||||
|
@ -68,6 +48,34 @@ max_length = 0
|
|||
# Limitation on number of training examples
|
||||
limit = 0
|
||||
|
||||
# Training hyper-parameters and additional features.
|
||||
[training]
|
||||
seed = ${system.seed}
|
||||
gpu_allocator = ${system.gpu_allocator}
|
||||
dropout = 0.1
|
||||
accumulate_gradient = 1
|
||||
# Extra resources for transfer-learning or pseudo-rehearsal
|
||||
init_tok2vec = ${paths.init_tok2vec}
|
||||
raw_text = ${paths.raw}
|
||||
vectors = null
|
||||
lookups = null
|
||||
# Controls early-stopping. 0 or -1 mean unlimited.
|
||||
patience = 1600
|
||||
max_epochs = 0
|
||||
max_steps = 20000
|
||||
eval_frequency = 200
|
||||
# Control how scores are printed and checkpoints are evaluated.
|
||||
score_weights = {}
|
||||
# Names of pipeline components that shouldn't be updated during training
|
||||
frozen_components = []
|
||||
# Location in the config where the dev corpus is defined
|
||||
dev_corpus = "corpora.dev"
|
||||
# Location in the config where the train corpus is defined
|
||||
train_corpus = "corpora.train"
|
||||
|
||||
[training.logger]
|
||||
@loggers = "spacy.ConsoleLogger.v1"
|
||||
|
||||
[training.batcher]
|
||||
@batchers = "spacy.batch_by_words.v1"
|
||||
discard_oversize = false
|
||||
|
|
|
@ -4,6 +4,7 @@ dropout = 0.2
|
|||
n_save_every = null
|
||||
component = "tok2vec"
|
||||
layer = ""
|
||||
corpus = "corpora.pretrain"
|
||||
|
||||
[pretraining.batcher]
|
||||
@batchers = "spacy.batch_by_words.v1"
|
||||
|
@ -12,13 +13,6 @@ discard_oversize = false
|
|||
tolerance = 0.2
|
||||
get_length = null
|
||||
|
||||
[pretraining.corpus]
|
||||
@readers = "spacy.JsonlReader.v1"
|
||||
path = ${paths.raw}
|
||||
min_length = 5
|
||||
max_length = 500
|
||||
limit = 0
|
||||
|
||||
[pretraining.objective]
|
||||
type = "characters"
|
||||
n_characters = 4
|
||||
|
@ -33,3 +27,12 @@ grad_clip = 1.0
|
|||
use_averages = true
|
||||
eps = 1e-8
|
||||
learn_rate = 0.001
|
||||
|
||||
[corpora]
|
||||
|
||||
[corpora.pretrain]
|
||||
@readers = "spacy.JsonlReader.v1"
|
||||
path = ${paths.raw}
|
||||
min_length = 5
|
||||
max_length = 500
|
||||
limit = 0
|
||||
|
|
|
@@ -121,7 +121,7 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
RETURNS (dict): Generated dependency parse keyed by words and arcs.
"""
doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data"]))
if not doc.is_parsed:
if not doc.has_annotation("DEP"):
warnings.warn(Warnings.W005)
if options.get("collapse_phrases", False):
with doc.retokenize() as retokenizer:
@@ -57,7 +57,10 @@ class Warnings:
"incorrect. Modify PhraseMatcher._terminal_hash to fix.")
W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
"the Knowledge Base.")
W026 = ("Unable to set all sentence boundaries from dependency parses.")
W026 = ("Unable to set all sentence boundaries from dependency parses. If "
"you are constructing a parse tree incrementally by setting "
"token.head values, you can probably ignore this warning. Consider "
"using Doc(words, ..., heads=heads, deps=deps) instead.")
W027 = ("Found a large training file of {size} bytes. Note that it may "
"be more efficient to split your training data into multiple "
"smaller JSON files instead.")
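The reworded W026 points at the Doc constructor, which in v3 accepts token-level annotations directly. A minimal sketch of creating a parsed Doc up front instead of setting `token.head` one token at a time (heads are given as absolute token indices, with the root attached to itself; the sentence and labels are made up for illustration):

```python
# Sketch: construct a Doc with heads/deps in one go (spaCy v3 API).
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
words = ["She", "likes", "cats"]
heads = [1, 1, 1]               # absolute indices: every token attaches to "likes"
deps = ["nsubj", "ROOT", "dobj"]

doc = Doc(nlp.vocab, words=words, heads=heads, deps=deps)
print(doc.has_annotation("DEP"))  # True
print([(t.text, t.dep_, t.head.text) for t in doc])
```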
@ -66,7 +69,7 @@ class Warnings:
|
|||
"in problems with the vocab further on in the pipeline.")
|
||||
W030 = ("Some entities could not be aligned in the text \"{text}\" with "
|
||||
"entities \"{entities}\". Use "
|
||||
"`spacy.training.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
|
||||
"`spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)`"
|
||||
" to check the alignment. Misaligned entities ('-') will be "
|
||||
"ignored during training.")
|
||||
W033 = ("Training a new {model} using a model with no lexeme normalization "
|
||||
|
@ -119,6 +122,8 @@ class Warnings:
|
|||
W105 = ("As of spaCy v3.0, the {matcher}.pipe method is deprecated. If you "
|
||||
"need to match on a stream of documents, you can use nlp.pipe and "
|
||||
"call the {matcher} on each Doc object.")
|
||||
W107 = ("The property Doc.{prop} is deprecated. Use "
|
||||
"Doc.has_annotation(\"{attr}\") instead.")
|
||||
|
||||
|
||||
@add_codes
|
||||
|
@ -192,11 +197,6 @@ class Errors:
|
|||
"Alternatively, add the dependency parser, or set sentence "
|
||||
"boundaries by setting doc[i].is_sent_start.")
|
||||
E031 = ("Invalid token: empty string ('') at position {i}.")
|
||||
E032 = ("Conflicting attributes specified in doc.from_array(): "
|
||||
"(HEAD, SENT_START). The HEAD attribute currently sets sentence "
|
||||
"boundaries implicitly, based on the tree structure. This means "
|
||||
"the HEAD attribute would potentially override the sentence "
|
||||
"boundaries set by SENT_START.")
|
||||
E033 = ("Cannot load into non-empty Doc of length {length}.")
|
||||
E035 = ("Error creating span with start {start} and end {end} for Doc of "
|
||||
"length {length}.")
|
||||
|
@ -397,8 +397,8 @@ class Errors:
|
|||
E154 = ("One of the attributes or values is not supported for token "
|
||||
"patterns. Please use the option validate=True with Matcher, "
|
||||
"PhraseMatcher, or EntityRuler for more details.")
|
||||
E155 = ("The pipeline needs to include a tagger in order to use "
|
||||
"Matcher or PhraseMatcher with the attributes POS, TAG, or LEMMA. "
|
||||
E155 = ("The pipeline needs to include a {pipe} in order to use "
|
||||
"Matcher or PhraseMatcher with the attribute {attr}. "
|
||||
"Try using nlp() instead of nlp.make_doc() or list(nlp.pipe()) "
|
||||
"instead of list(nlp.tokenizer.pipe()).")
|
||||
E156 = ("The pipeline needs to include a parser in order to use "
|
||||
|
@ -455,7 +455,7 @@ class Errors:
|
|||
"{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.")
|
||||
E186 = ("'{tok_a}' and '{tok_b}' are different texts.")
|
||||
E187 = ("Only unicode strings are supported as labels.")
|
||||
E189 = ("Each argument to `get_doc` should be of equal length.")
|
||||
E189 = ("Each argument to Doc.__init__ should be of equal length.")
|
||||
E190 = ("Token head out of range in `Doc.from_array()` for token index "
|
||||
"'{index}' with value '{value}' (equivalent to relative head "
|
||||
"index: '{rel_head_index}'). The head indices should be relative "
|
||||
|
@ -480,6 +480,13 @@ class Errors:
|
|||
E201 = ("Span index out of range.")
|
||||
|
||||
# TODO: fix numbering after merging develop into master
|
||||
E915 = ("Can't use score '{name}' to calculate final weighted score. Expected "
|
||||
"float or int but got: {score_type}. To exclude the score from the "
|
||||
"final score, set its weight to null in the [training.score_weights] "
|
||||
"section of your training config.")
|
||||
E916 = ("Can't log score for '{name}' in table: not a valid score ({score_type})")
|
||||
E917 = ("Received invalid value {value} for 'state_type' in "
|
||||
"TransitionBasedParser: only 'parser' or 'ner' are valid options.")
|
||||
E918 = ("Received invalid value for vocab: {vocab} ({vocab_type}). Valid "
|
||||
"values are an instance of spacy.vocab.Vocab or True to create one"
|
||||
" (default).")
|
||||
|
@ -545,7 +552,8 @@ class Errors:
|
|||
E949 = ("Can only create an alignment when the texts are the same.")
|
||||
E952 = ("The section '{name}' is not a valid section in the provided config.")
|
||||
E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
|
||||
E954 = ("The Tok2Vec listener did not receive a valid input.")
|
||||
E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
|
||||
"component.")
|
||||
E955 = ("Can't find table(s) '{table}' for language '{lang}' in spacy-lookups-data.")
|
||||
E956 = ("Can't find component '{name}' in [components] block in the config. "
|
||||
"Available components: {opts}")
|
||||
|
|
|
@ -140,7 +140,6 @@ cdef class KnowledgeBase:
|
|||
self._entries.push_back(entry)
|
||||
self._aliases_table.push_back(alias)
|
||||
|
||||
cpdef from_disk(self, loc)
|
||||
cpdef set_entities(self, entity_list, freq_list, vector_list)
|
||||
|
||||
|
||||
|
|
spacy/kb.pyx
@@ -9,7 +9,8 @@ from libcpp.vector cimport vector

from pathlib import Path
import warnings
from os import path

from spacy import util

from .typedefs cimport hash_t
from .errors import Errors, Warnings

@@ -319,8 +320,14 @@ cdef class KnowledgeBase:
return 0.0


def to_disk(self, loc):
cdef Writer writer = Writer(loc)
def to_disk(self, path):
path = util.ensure_path(path)
if path.is_dir():
raise ValueError(Errors.E928.format(loc=path))
if not path.parent.exists():
path.parent.mkdir(parents=True)

cdef Writer writer = Writer(path)
writer.write_header(self.get_size_entities(), self.entity_vector_length)

# dumping the entity vectors in their original order
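`KnowledgeBase.to_disk` now takes a proper path, creates missing parent directories, and refuses a directory location; `from_disk` mirrors those checks. A hedged usage sketch (the entity ID, frequency, vector length and the exact `KnowledgeBase` constructor keywords are assumptions based on the v3 entity-linking API, not taken from this diff):

```python
# Usage sketch for the path-based KB serialization shown above.
from pathlib import Path
import spacy
from spacy.kb import KnowledgeBase

nlp = spacy.blank("en")
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
kb.add_entity(entity="Q42", freq=12, entity_vector=[1.0, 2.0, 3.0])

kb_path = Path("output") / "my_kb"   # a file path, not a directory
kb.to_disk(kb_path)                  # parent dirs are created if missing

kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
kb2.from_disk(kb_path)
print(kb2.get_size_entities())       # 1
```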
@ -359,7 +366,13 @@ cdef class KnowledgeBase:
|
|||
|
||||
writer.close()
|
||||
|
||||
cpdef from_disk(self, loc):
|
||||
def from_disk(self, path):
|
||||
path = util.ensure_path(path)
|
||||
if path.is_dir():
|
||||
raise ValueError(Errors.E928.format(loc=path))
|
||||
if not path.exists():
|
||||
raise ValueError(Errors.E929.format(loc=path))
|
||||
|
||||
cdef hash_t entity_hash
|
||||
cdef hash_t alias_hash
|
||||
cdef int64_t entry_index
|
||||
|
@ -369,7 +382,7 @@ cdef class KnowledgeBase:
|
|||
cdef AliasC alias
|
||||
cdef float vector_element
|
||||
|
||||
cdef Reader reader = Reader(loc)
|
||||
cdef Reader reader = Reader(path)
|
||||
|
||||
# STEP 0: load header and initialize KB
|
||||
cdef int64_t nr_entities
|
||||
|
@ -450,16 +463,13 @@ cdef class KnowledgeBase:
|
|||
|
||||
|
||||
cdef class Writer:
|
||||
def __init__(self, object loc):
|
||||
if isinstance(loc, Path):
|
||||
loc = bytes(loc)
|
||||
if path.exists(loc):
|
||||
if path.isdir(loc):
|
||||
raise ValueError(Errors.E928.format(loc=loc))
|
||||
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
|
||||
def __init__(self, path):
|
||||
assert isinstance(path, Path)
|
||||
content = bytes(path)
|
||||
cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
|
||||
self._fp = fopen(<char*>bytes_loc, 'wb')
|
||||
if not self._fp:
|
||||
raise IOError(Errors.E146.format(path=loc))
|
||||
raise IOError(Errors.E146.format(path=path))
|
||||
fseek(self._fp, 0, 0)
|
||||
|
||||
def close(self):
|
||||
|
@ -496,14 +506,9 @@ cdef class Writer:
|
|||
|
||||
|
||||
cdef class Reader:
|
||||
def __init__(self, object loc):
|
||||
if isinstance(loc, Path):
|
||||
loc = bytes(loc)
|
||||
if not path.exists(loc):
|
||||
raise ValueError(Errors.E929.format(loc=loc))
|
||||
if path.isdir(loc):
|
||||
raise ValueError(Errors.E928.format(loc=loc))
|
||||
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
|
||||
def __init__(self, path):
|
||||
content = bytes(path)
|
||||
cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
|
||||
self._fp = fopen(<char*>bytes_loc, 'rb')
|
||||
if not self._fp:
|
||||
PyErr_SetFromErrno(IOError)
|
||||
|
|
|
@ -1,7 +1,11 @@
|
|||
from typing import Optional
|
||||
from thinc.api import Model
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
from ...pipeline import Lemmatizer
|
||||
|
||||
|
||||
class BengaliDefaults(Language.Defaults):
|
||||
|
@ -17,4 +21,21 @@ class Bengali(Language):
|
|||
Defaults = BengaliDefaults
|
||||
|
||||
|
||||
@Bengali.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "lookups": None},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
lookups: Optional[Lookups],
|
||||
):
|
||||
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
|
||||
return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
|
||||
|
||||
|
||||
__all__ = ["Bengali"]
|
||||
|
|
|
@ -16,7 +16,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
|||
labels = ["sb", "oa", "da", "nk", "mo", "ag", "ROOT", "root", "cj", "pd", "og", "app"]
|
||||
# fmt: on
|
||||
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||
if not doc.is_parsed:
|
||||
if not doc.has_annotation("DEP"):
|
||||
raise ValueError(Errors.E029)
|
||||
np_label = doc.vocab.strings.add("NP")
|
||||
np_deps = set(doc.vocab.strings.add(label) for label in labels)
|
||||
|
|
|
@ -30,7 +30,6 @@ class Greek(Language):
|
|||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "lookups": None},
|
||||
scores=["lemma_acc"],
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
|
|
|
@ -13,7 +13,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
|||
# Further improvement of the models will eliminate the need for this tag.
|
||||
labels = ["nsubj", "obj", "iobj", "appos", "ROOT", "obl"]
|
||||
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||
if not doc.is_parsed:
|
||||
if not doc.has_annotation("DEP"):
|
||||
raise ValueError(Errors.E029)
|
||||
np_deps = [doc.vocab.strings.add(label) for label in labels]
|
||||
conj = doc.vocab.strings.add("conj")
|
||||
|
|
|
@ -29,7 +29,6 @@ class English(Language):
|
|||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "lookups": None},
|
||||
scores=["lemma_acc"],
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
|
|
|
@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
|||
labels = ["nsubj", "dobj", "nsubjpass", "pcomp", "pobj", "dative", "appos", "attr", "ROOT"]
|
||||
# fmt: on
|
||||
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||
if not doc.is_parsed:
|
||||
if not doc.has_annotation("DEP"):
|
||||
raise ValueError(Errors.E029)
|
||||
np_deps = [doc.vocab.strings.add(label) for label in labels]
|
||||
conj = doc.vocab.strings.add("conj")
|
||||
|
|
|
@ -8,7 +8,7 @@ from ...tokens import Doc, Span, Token
|
|||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
||||
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
|
||||
doc = doclike.doc
|
||||
if not doc.is_parsed:
|
||||
if not doc.has_annotation("DEP"):
|
||||
raise ValueError(Errors.E029)
|
||||
if not len(doc):
|
||||
return
|
||||
|
|
|
@ -1,9 +1,13 @@
|
|||
from typing import Optional
|
||||
from thinc.api import Model
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
from ...pipeline import Lemmatizer
|
||||
|
||||
|
||||
class PersianDefaults(Language.Defaults):
|
||||
|
@ -20,4 +24,21 @@ class Persian(Language):
|
|||
Defaults = PersianDefaults
|
||||
|
||||
|
||||
@Persian.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "lookups": None},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
lookups: Optional[Lookups],
|
||||
):
|
||||
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
|
||||
return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
|
||||
|
||||
|
||||
__all__ = ["Persian"]
|
||||
|
|
|
@ -19,7 +19,7 @@ def noun_chunks(doclike):
|
|||
]
|
||||
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||
|
||||
if not doc.is_parsed:
|
||||
if not doc.has_annotation("DEP"):
|
||||
raise ValueError(Errors.E029)
|
||||
|
||||
np_deps = [doc.vocab.strings.add(label) for label in labels]
|
||||
|
|
|
@ -33,7 +33,6 @@ class French(Language):
|
|||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "lookups": None},
|
||||
scores=["lemma_acc"],
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
|
|
|
@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
|||
labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
|
||||
# fmt: on
|
||||
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||
if not doc.is_parsed:
|
||||
if not doc.has_annotation("DEP"):
|
||||
raise ValueError(Errors.E029)
|
||||
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||
conj = doc.vocab.strings.add("conj")
|
||||
|
|
|
@ -13,7 +13,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
|||
labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
|
||||
# fmt: on
|
||||
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||
if not doc.is_parsed:
|
||||
if not doc.has_annotation("DEP"):
|
||||
raise ValueError(Errors.E029)
|
||||
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||
conj = doc.vocab.strings.add("conj")
|
||||
|
|
|
@ -1,9 +1,13 @@
|
|||
from typing import Optional
|
||||
from thinc.api import Model
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
from ...pipeline import Lemmatizer
|
||||
|
||||
|
||||
class NorwegianDefaults(Language.Defaults):
|
||||
|
@ -20,4 +24,21 @@ class Norwegian(Language):
|
|||
Defaults = NorwegianDefaults
|
||||
|
||||
|
||||
@Norwegian.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "lookups": None},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
lookups: Optional[Lookups],
|
||||
):
|
||||
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
|
||||
return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
|
||||
|
||||
|
||||
__all__ = ["Norwegian"]
|
||||
|
|
|
@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
|||
labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
|
||||
# fmt: on
|
||||
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||
if not doc.is_parsed:
|
||||
if not doc.has_annotation("DEP"):
|
||||
raise ValueError(Errors.E029)
|
||||
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||
conj = doc.vocab.strings.add("conj")
|
||||
|
|
|
@ -30,7 +30,6 @@ class Dutch(Language):
|
|||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "lookups": None},
|
||||
scores=["lemma_acc"],
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
|
|
|
@ -35,7 +35,6 @@ class Polish(Language):
|
|||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "pos_lookup", "lookups": None},
|
||||
scores=["lemma_acc"],
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
|
|
|
@ -25,7 +25,6 @@ class Russian(Language):
|
|||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "pymorphy2", "lookups": None},
|
||||
scores=["lemma_acc"],
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
|
|
|
@ -1,8 +1,13 @@
|
|||
from typing import Optional
|
||||
from thinc.api import Model
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
from ...pipeline import Lemmatizer
|
||||
|
||||
|
||||
# Punctuation stolen from Danish
|
||||
from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||
|
@ -22,4 +27,21 @@ class Swedish(Language):
|
|||
Defaults = SwedishDefaults
|
||||
|
||||
|
||||
@Swedish.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "lookups": None},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
lookups: Optional[Lookups],
|
||||
):
|
||||
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
|
||||
return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
|
||||
|
||||
|
||||
__all__ = ["Swedish"]
|
||||
|
|
|
@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
|||
labels = ["nsubj", "nsubj:pass", "dobj", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
|
||||
# fmt: on
|
||||
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||
if not doc.is_parsed:
|
||||
if not doc.has_annotation("DEP"):
|
||||
raise ValueError(Errors.E029)
|
||||
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||
conj = doc.vocab.strings.add("conj")
|
||||
|
|
|
@ -25,7 +25,6 @@ class Ukrainian(Language):
|
|||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "pymorphy2", "lookups": None},
|
||||
scores=["lemma_acc"],
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
|
|
|
@ -8,7 +8,7 @@ from contextlib import contextmanager
|
|||
from copy import deepcopy
|
||||
from pathlib import Path
|
||||
import warnings
|
||||
from thinc.api import get_current_ops, Config, require_gpu, Optimizer
|
||||
from thinc.api import Model, get_current_ops, Config, require_gpu, Optimizer
|
||||
import srsly
|
||||
import multiprocessing as mp
|
||||
from itertools import chain, cycle
|
||||
|
@ -31,6 +31,7 @@ from .schemas import ConfigSchema
|
|||
from .git_info import GIT_VERSION
|
||||
from . import util
|
||||
from . import about
|
||||
from .lookups import load_lookups
|
||||
|
||||
|
||||
# This is the base config will all settings (training etc.)
|
||||
|
@ -86,6 +87,13 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
|
|||
return tokenizer_factory
|
||||
|
||||
|
||||
@registry.misc("spacy.LookupsDataLoader.v1")
|
||||
def load_lookups_data(lang, tables):
|
||||
util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
|
||||
lookups = load_lookups(lang=lang, tables=tables)
|
||||
return lookups
|
||||
|
||||
|
||||
class Language:
|
||||
"""A text-processing pipeline. Usually you'll load this once per process,
|
||||
and pass the instance around your application.
|
||||
|
@ -148,12 +156,7 @@ class Language:
|
|||
raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab)))
|
||||
if vocab is True:
|
||||
vectors_name = meta.get("vectors", {}).get("name")
|
||||
vocab = create_vocab(
|
||||
self.lang,
|
||||
self.Defaults,
|
||||
vectors_name=vectors_name,
|
||||
load_data=self._config["nlp"]["load_vocab_data"],
|
||||
)
|
||||
vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
|
||||
else:
|
||||
if (self.lang and vocab.lang) and (self.lang != vocab.lang):
|
||||
raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
|
||||
|
@ -245,9 +248,12 @@ class Language:
|
|||
self._config["nlp"]["pipeline"] = list(self.component_names)
|
||||
self._config["nlp"]["disabled"] = list(self.disabled)
|
||||
self._config["components"] = pipeline
|
||||
if not self._config["training"].get("score_weights"):
|
||||
combined_score_weights = combine_score_weights(score_weights)
|
||||
self._config["training"]["score_weights"] = combined_score_weights
|
||||
# We're merging the existing score weights back into the combined
|
||||
# weights to make sure we're preserving custom settings in the config
|
||||
# but also reflect updates (e.g. new components added)
|
||||
prev_weights = self._config["training"].get("score_weights", {})
|
||||
combined_score_weights = combine_score_weights(score_weights, prev_weights)
|
||||
self._config["training"]["score_weights"] = combined_score_weights
|
||||
if not srsly.is_json_serializable(self._config):
|
||||
raise ValueError(Errors.E961.format(config=self._config))
|
||||
return self._config
|
||||
|
@ -409,7 +415,6 @@ class Language:
|
|||
assigns: Iterable[str] = SimpleFrozenList(),
|
||||
requires: Iterable[str] = SimpleFrozenList(),
|
||||
retokenizes: bool = False,
|
||||
scores: Iterable[str] = SimpleFrozenList(),
|
||||
default_score_weights: Dict[str, float] = SimpleFrozenDict(),
|
||||
func: Optional[Callable] = None,
|
||||
) -> Callable:
|
||||
|
@ -427,12 +432,11 @@ class Language:
|
|||
e.g. "token.ent_id". Used for pipeline analyis.
|
||||
retokenizes (bool): Whether the component changes the tokenization.
|
||||
Used for pipeline analysis.
|
||||
scores (Iterable[str]): All scores set by the component if it's trainable,
|
||||
e.g. ["ents_f", "ents_r", "ents_p"].
|
||||
default_score_weights (Dict[str, float]): The scores to report during
|
||||
training, and their default weight towards the final score used to
|
||||
select the best model. Weights should sum to 1.0 per component and
|
||||
will be combined and normalized for the whole pipeline.
|
||||
will be combined and normalized for the whole pipeline. If None,
|
||||
the score won't be shown in the logs or be weighted.
|
||||
func (Optional[Callable]): Factory function if not used as a decorator.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/language#factory
|
||||
|
@ -472,7 +476,7 @@ class Language:
|
|||
default_config=default_config,
|
||||
assigns=validate_attrs(assigns),
|
||||
requires=validate_attrs(requires),
|
||||
scores=scores,
|
||||
scores=list(default_score_weights.keys()),
|
||||
default_score_weights=default_score_weights,
|
||||
retokenizes=retokenizes,
|
||||
)
|
||||
|
@ -1448,10 +1452,15 @@ class Language:
|
|||
"""Register 'listeners' within pipeline components, to allow them to
|
||||
effectively share weights.
|
||||
"""
|
||||
# I had though, "Why do we do this inside the Language object? Shouldn't
|
||||
# it be the tok2vec/transformer/etc's job?
|
||||
# The problem is we need to do it during deserialization...And the
|
||||
# components don't receive the pipeline then. So this does have to be
|
||||
# here :(
|
||||
for i, (name1, proc1) in enumerate(self.pipeline):
|
||||
if hasattr(proc1, "find_listeners"):
|
||||
for name2, proc2 in self.pipeline[i:]:
|
||||
if hasattr(proc2, "model"):
|
||||
for name2, proc2 in self.pipeline[i + 1 :]:
|
||||
if isinstance(getattr(proc2, "model", None), Model):
|
||||
proc1.find_listeners(proc2.model)
|
||||
|
||||
@classmethod
|
||||
|
|
|
@ -17,7 +17,7 @@ from ..vocab cimport Vocab
|
|||
from ..tokens.doc cimport Doc, get_token_attr_for_matcher
|
||||
from ..tokens.span cimport Span
|
||||
from ..tokens.token cimport Token
|
||||
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA
|
||||
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH
|
||||
|
||||
from ..schemas import validate_token_pattern
|
||||
from ..errors import Errors, MatchPatternError, Warnings
|
||||
|
@@ -215,10 +215,15 @@ cdef class Matcher:
else:
raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__))
cdef Pool tmp_pool = Pool()
if len(set([LEMMA, POS, TAG]) & self._seen_attrs) > 0 \
and not doc.is_tagged:
raise ValueError(Errors.E155.format())
if DEP in self._seen_attrs and not doc.is_parsed:
if TAG in self._seen_attrs and not doc.has_annotation("TAG"):
raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG"))
if POS in self._seen_attrs and not doc.has_annotation("POS"):
raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS"))
if MORPH in self._seen_attrs and not doc.has_annotation("MORPH"):
raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH"))
if LEMMA in self._seen_attrs and not doc.has_annotation("LEMMA"):
raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA"))
if DEP in self._seen_attrs and not doc.has_annotation("DEP"):
raise ValueError(Errors.E156.format())
matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
extensions=self._extensions, predicates=self._extra_predicates)
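These per-attribute checks are what surface the updated E155/E156 messages: matching on `TAG`, `POS`, `MORPH`, `LEMMA` or `DEP` requires the corresponding annotation to be present on the `Doc`. A hedged sketch of when the error fires and how to avoid it (assumes a trained English pipeline such as `en_core_web_sm` is installed):

```python
# Sketch: POS-based patterns need annotation from the pipeline.
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")  # assumed to be installed
matcher = Matcher(nlp.vocab)
matcher.add("NOUNS", [[{"POS": "NOUN"}]])

doc = nlp.make_doc("Cats chase mice")   # tokenization only, no POS tags
try:
    matcher(doc)
except ValueError as err:
    print("E155:", err)                 # pipeline needs to set POS first

doc = nlp("Cats chase mice")            # full pipeline sets POS
print([doc[start:end].text for _, start, end in matcher(doc)])
```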
@ -4,7 +4,7 @@ from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter
|
|||
|
||||
import warnings
|
||||
|
||||
from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA
|
||||
from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA, MORPH
|
||||
from ..structs cimport TokenC
|
||||
from ..tokens.token cimport Token
|
||||
from ..tokens.span cimport Span
|
||||
|
@ -184,12 +184,20 @@ cdef class PhraseMatcher:
|
|||
if len(doc) == 0:
|
||||
continue
|
||||
if isinstance(doc, Doc):
|
||||
if self.attr in (POS, TAG, LEMMA) and not doc.is_tagged:
|
||||
raise ValueError(Errors.E155.format())
|
||||
if self.attr == DEP and not doc.is_parsed:
|
||||
attrs = (TAG, POS, MORPH, LEMMA, DEP)
|
||||
has_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
|
||||
if self.attr == TAG and not has_annotation[TAG]:
|
||||
raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG"))
|
||||
if self.attr == POS and not has_annotation[POS]:
|
||||
raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS"))
|
||||
if self.attr == MORPH and not has_annotation[MORPH]:
|
||||
raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH"))
|
||||
if self.attr == LEMMA and not has_annotation[LEMMA]:
|
||||
raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA"))
|
||||
if self.attr == DEP and not has_annotation[DEP]:
|
||||
raise ValueError(Errors.E156.format())
|
||||
if self._validate and (doc.is_tagged or doc.is_parsed) \
|
||||
and self.attr not in (DEP, POS, TAG, LEMMA):
|
||||
if self._validate and any(has_annotation.values()) \
|
||||
and self.attr not in attrs:
|
||||
string_attr = self.vocab.strings[self.attr]
|
||||
warnings.warn(Warnings.W012.format(key=key, attr=string_attr))
|
||||
keyword = self._convert_to_array(doc)
|
||||
|
|
|
@ -2,6 +2,8 @@ from typing import Optional, List
|
|||
from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
|
||||
from thinc.types import Floats2d
|
||||
|
||||
from ...errors import Errors
|
||||
from ...compat import Literal
|
||||
from ...util import registry
|
||||
from .._precomputable_affine import PrecomputableAffine
|
||||
from ..tb_framework import TransitionModel
|
||||
|
@ -11,7 +13,8 @@ from ...tokens import Doc
|
|||
@registry.architectures.register("spacy.TransitionBasedParser.v1")
|
||||
def build_tb_parser_model(
|
||||
tok2vec: Model[List[Doc], List[Floats2d]],
|
||||
nr_feature_tokens: int,
|
||||
state_type: Literal["parser", "ner"],
|
||||
extra_state_tokens: bool,
|
||||
hidden_width: int,
|
||||
maxout_pieces: int,
|
||||
use_upper: bool = True,
|
||||
|
@ -40,20 +43,12 @@ def build_tb_parser_model(
|
|||
|
||||
tok2vec (Model[List[Doc], List[Floats2d]]):
|
||||
Subnetwork to map tokens into vector representations.
|
||||
nr_feature_tokens (int): The number of tokens in the context to use to
|
||||
construct the state vector. Valid choices are 1, 2, 3, 6, 8 and 13. The
|
||||
2, 8 and 13 feature sets are designed for the parser, while the 3 and 6
|
||||
feature sets are designed for the NER. The recommended feature sets are
|
||||
3 for NER, and 8 for the dependency parser.
|
||||
|
||||
TODO: This feature should be split into two, state_type: ["deps", "ner"]
|
||||
and extra_state_features: [True, False]. This would map into:
|
||||
|
||||
(deps, False): 8
|
||||
(deps, True): 13
|
||||
(ner, False): 3
|
||||
(ner, True): 6
|
||||
|
||||
state_type (str):
|
||||
String value denoting the type of parser model: "parser" or "ner"
|
||||
extra_state_tokens (bool): Whether or not to use additional tokens in the context
|
||||
to construct the state vector. Defaults to `False`, which means 3 and 8
|
||||
for the NER and parser respectively. When set to `True`, this would become 6
|
||||
feature sets (for the NER) or 13 (for the parser).
|
||||
hidden_width (int): The width of the hidden layer.
|
||||
maxout_pieces (int): How many pieces to use in the state prediction layer.
|
||||
Recommended values are 1, 2 or 3. If 1, the maxout non-linearity
|
||||
|
@ -68,8 +63,14 @@ def build_tb_parser_model(
|
|||
Usually inferred from data at the beginning of training, or loaded from
|
||||
disk.
|
||||
"""
|
||||
if state_type == "parser":
|
||||
nr_feature_tokens = 13 if extra_state_tokens else 8
|
||||
elif state_type == "ner":
|
||||
nr_feature_tokens = 6 if extra_state_tokens else 3
|
||||
else:
|
||||
raise ValueError(Errors.E917.format(value=state_type))
|
||||
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
|
||||
tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width),)
|
||||
tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width))
|
||||
tok2vec.set_dim("nO", hidden_width)
|
||||
lower = PrecomputableAffine(
|
||||
nO=hidden_width if use_upper else nO,
|
||||
|
|
|
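After this change the architecture no longer takes `nr_feature_tokens` directly: callers pass `state_type` plus `extra_state_tokens`, and the feature count is derived (parser: 8 or 13, NER: 3 or 6). A small sketch of reading such a block from a config string with thinc's `Config`, mirroring the `[components.ner.model]` blocks elsewhere in this diff (the parsing of the unresolved config values is assumed here):

```python
# Sketch: parse a TransitionBasedParser block the way the defaults above do.
from thinc.api import Config

cfg_str = """
[model]
@architectures = "spacy.TransitionBasedParser.v1"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
"""
model_cfg = Config().from_str(cfg_str)["model"]

# Derive the feature count following the mapping in the code above.
state_type = model_cfg["state_type"]
extra = model_cfg["extra_state_tokens"]
if state_type == "parser":
    nr_feature_tokens = 13 if extra else 8
else:  # "ner"
    nr_feature_tokens = 6 if extra else 3
print(nr_feature_tokens)  # 3
```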
@ -164,7 +164,9 @@ def MultiHashEmbed(
|
|||
|
||||
|
||||
@registry.architectures.register("spacy.CharacterEmbed.v1")
|
||||
def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
|
||||
def CharacterEmbed(
|
||||
width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool
|
||||
):
|
||||
"""Construct an embedded representation based on character embeddings, using
|
||||
a feed-forward network. A fixed number of UTF-8 byte characters are used for
|
||||
each word, taken from the beginning and end of the word equally. Padding is
|
||||
|
@ -188,19 +190,40 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
|
|||
nC (int): The number of UTF-8 bytes to embed per word. Recommended values
|
||||
are between 3 and 8, although it may depend on the length of words in the
|
||||
language.
|
||||
also_use_static_vectors (bool): Whether to also use static word vectors.
|
||||
Requires a vectors table to be loaded in the Doc objects' vocab.
|
||||
"""
|
||||
model = chain(
|
||||
concatenate(
|
||||
chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
|
||||
chain(
|
||||
FeatureExtractor([NORM]),
|
||||
list2ragged(),
|
||||
with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
|
||||
if also_use_static_vectors:
|
||||
model = chain(
|
||||
concatenate(
|
||||
chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
|
||||
chain(
|
||||
FeatureExtractor([NORM]),
|
||||
list2ragged(),
|
||||
with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
|
||||
),
|
||||
StaticVectors(width, dropout=0.0),
|
||||
),
|
||||
),
|
||||
with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)),
|
||||
ragged2list(),
|
||||
)
|
||||
with_array(
|
||||
Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0)
|
||||
),
|
||||
ragged2list(),
|
||||
)
|
||||
else:
|
||||
model = chain(
|
||||
concatenate(
|
||||
chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
|
||||
chain(
|
||||
FeatureExtractor([NORM]),
|
||||
list2ragged(),
|
||||
with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
|
||||
),
|
||||
),
|
||||
with_array(
|
||||
Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)
|
||||
),
|
||||
ragged2list(),
|
||||
)
|
||||
return model
|
||||
|
||||
|
||||
|
|
|
@ -679,8 +679,7 @@ cdef class ArcEager(TransitionSystem):
|
|||
st._sent[i].dep = self.root_label
|
||||
|
||||
def finalize_doc(self, Doc doc):
|
||||
doc.is_parsed = True
|
||||
set_children_from_heads(doc.c, doc.length)
|
||||
set_children_from_heads(doc.c, 0, doc.length)
|
||||
|
||||
def has_gold(self, Example eg, start=0, end=None):
|
||||
for word in eg.y[start:end]:
|
||||
|
|
|
@ -119,7 +119,7 @@ cpdef deprojectivize(Doc doc):
|
|||
new_head = _find_new_head(doc[i], head_label)
|
||||
doc.c[i].head = new_head.i - i
|
||||
doc.c[i].dep = doc.vocab.strings.add(new_label)
|
||||
set_children_from_heads(doc.c, doc.length)
|
||||
set_children_from_heads(doc.c, 0, doc.length)
|
||||
return doc
|
||||
|
||||
|
||||
|
|
|
@ -15,7 +15,8 @@ from ..training import validate_examples
|
|||
default_model_config = """
|
||||
[model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 8
|
||||
state_type = "parser"
|
||||
extra_state_tokens = false
|
||||
hidden_width = 64
|
||||
maxout_pieces = 2
|
||||
|
||||
|
@@ -42,8 +43,14 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]
"min_action_freq": 30,
"model": DEFAULT_PARSER_MODEL,
},
scores=["dep_uas", "dep_las", "dep_las_per_type", "sents_p", "sents_r", "sents_f"],
default_score_weights={"dep_uas": 0.5, "dep_las": 0.5, "sents_f": 0.0},
default_score_weights={
"dep_uas": 0.5,
"dep_las": 0.5,
"dep_las_per_type": None,
"sents_p": None,
"sents_r": None,
"sents_f": 0.0,
},
)
def make_parser(
nlp: Language,
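Scores weighted `None` are still registered for the component but excluded from the combined score, which is also how non-numeric breakdowns such as `dep_las_per_type` avoid the E915 check earlier in this diff. A small sketch of computing a weighted final score while skipping `None` weights (my own illustration, not spaCy's exact implementation):

```python
# Illustration: combine component scores, ignoring None-weighted entries.
def weighted_score(scores: dict, weights: dict) -> float:
    usable = {k: w for k, w in weights.items() if w is not None}
    return sum(scores.get(k, 0.0) * w for k, w in usable.items())

weights = {"dep_uas": 0.5, "dep_las": 0.5, "dep_las_per_type": None,
           "sents_p": None, "sents_r": None, "sents_f": 0.0}
scores = {"dep_uas": 0.90, "dep_las": 0.88, "sents_f": 0.95,
          "dep_las_per_type": {"nsubj": {"f": 0.9}}}  # dict, can't be weighted

print(round(weighted_score(scores, weights), 3))  # 0.89
```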
@ -25,8 +25,12 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
|
|||
"overwrite_ents": False,
|
||||
"ent_id_sep": DEFAULT_ENT_ID_SEP,
|
||||
},
|
||||
scores=["ents_p", "ents_r", "ents_f", "ents_per_type"],
|
||||
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0},
|
||||
default_score_weights={
|
||||
"ents_f": 1.0,
|
||||
"ents_p": 0.0,
|
||||
"ents_r": 0.0,
|
||||
"ents_per_type": None,
|
||||
},
|
||||
)
|
||||
def make_entity_ruler(
|
||||
nlp: Language,
|
||||
|
|
|
@ -17,7 +17,7 @@ def merge_noun_chunks(doc: Doc) -> Doc:
|
|||
|
||||
DOCS: https://nightly.spacy.io/api/pipeline-functions#merge_noun_chunks
|
||||
"""
|
||||
if not doc.is_parsed:
|
||||
if not doc.has_annotation("DEP"):
|
||||
return doc
|
||||
with doc.retokenize() as retokenizer:
|
||||
for np in doc.noun_chunks:
|
||||
|
|
|
@ -21,7 +21,6 @@ from .. import util
|
|||
"lookups": None,
|
||||
"overwrite": False,
|
||||
},
|
||||
scores=["lemma_acc"],
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
|
|
|
@ -32,6 +32,7 @@ width = 128
|
|||
rows = 7000
|
||||
nM = 64
|
||||
nC = 8
|
||||
also_use_static_vectors = false
|
||||
|
||||
[model.tok2vec.encode]
|
||||
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
||||
|
@ -48,8 +49,7 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
|
|||
"morphologizer",
|
||||
assigns=["token.morph", "token.pos"],
|
||||
default_config={"model": DEFAULT_MORPH_MODEL},
|
||||
scores=["pos_acc", "morph_acc", "morph_per_feat"],
|
||||
default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5},
|
||||
default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
|
||||
)
|
||||
def make_morphologizer(
|
||||
nlp: Language,
|
||||
|
@ -203,8 +203,6 @@ class Morphologizer(Tagger):
|
|||
doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"][morph])
|
||||
doc.c[j].pos = self.cfg["labels_pos"][morph]
|
||||
|
||||
doc.is_morphed = True
|
||||
|
||||
def get_loss(self, examples, scores):
|
||||
"""Find the loss and gradient of loss for the batch of documents and
|
||||
their predicted scores.
|
||||
|
@ -259,79 +257,3 @@ class Morphologizer(Tagger):
|
|||
results.update(Scorer.score_token_attr_per_feat(examples,
|
||||
"morph", **kwargs))
|
||||
return results
|
||||
|
||||
def to_bytes(self, *, exclude=tuple()):
|
||||
"""Serialize the pipe to a bytestring.
|
||||
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (bytes): The serialized object.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/morphologizer#to_bytes
|
||||
"""
|
||||
serialize = {}
|
||||
serialize["model"] = self.model.to_bytes
|
||||
serialize["vocab"] = self.vocab.to_bytes
|
||||
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
|
||||
return util.to_bytes(serialize, exclude)
|
||||
|
||||
def from_bytes(self, bytes_data, *, exclude=tuple()):
|
||||
"""Load the pipe from a bytestring.
|
||||
|
||||
bytes_data (bytes): The serialized pipe.
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (Morphologizer): The loaded Morphologizer.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/morphologizer#from_bytes
|
||||
"""
|
||||
def load_model(b):
|
||||
try:
|
||||
self.model.from_bytes(b)
|
||||
except AttributeError:
|
||||
raise ValueError(Errors.E149) from None
|
||||
|
||||
deserialize = {
|
||||
"vocab": lambda b: self.vocab.from_bytes(b),
|
||||
"cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
|
||||
"model": lambda b: load_model(b),
|
||||
}
|
||||
util.from_bytes(bytes_data, deserialize, exclude)
|
||||
return self
|
||||
|
||||
def to_disk(self, path, *, exclude=tuple()):
|
||||
"""Serialize the pipe to disk.
|
||||
|
||||
path (str / Path): Path to a directory.
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/morphologizer#to_disk
|
||||
"""
|
||||
serialize = {
|
||||
"vocab": lambda p: self.vocab.to_disk(p),
|
||||
"model": lambda p: p.open("wb").write(self.model.to_bytes()),
|
||||
"cfg": lambda p: srsly.write_json(p, self.cfg),
|
||||
}
|
||||
util.to_disk(path, serialize, exclude)
|
||||
|
||||
def from_disk(self, path, *, exclude=tuple()):
|
||||
"""Load the pipe from disk. Modifies the object in place and returns it.
|
||||
|
||||
path (str / Path): Path to a directory.
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (Morphologizer): The modified Morphologizer object.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/morphologizer#from_disk
|
||||
"""
|
||||
def load_model(p):
|
||||
with p.open("rb") as file_:
|
||||
try:
|
||||
self.model.from_bytes(file_.read())
|
||||
except AttributeError:
|
||||
raise ValueError(Errors.E149) from None
|
||||
|
||||
deserialize = {
|
||||
"vocab": lambda p: self.vocab.from_disk(p),
|
||||
"cfg": lambda p: self.cfg.update(deserialize_config(p)),
|
||||
"model": load_model,
|
||||
}
|
||||
util.from_disk(path, deserialize, exclude)
|
||||
return self
|
||||
|
|
|
@ -13,7 +13,8 @@ from ..training import validate_examples
|
|||
default_model_config = """
|
||||
[model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 6
|
||||
state_type = "ner"
|
||||
extra_state_tokens = false
|
||||
hidden_width = 64
|
||||
maxout_pieces = 2
|
||||
|
||||
|
@ -38,8 +39,7 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
|
|||
"update_with_oracle_cut_size": 100,
|
||||
"model": DEFAULT_NER_MODEL,
|
||||
},
|
||||
scores=["ents_p", "ents_r", "ents_f", "ents_per_type"],
|
||||
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0},
|
||||
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
|
||||
|
||||
)
|
||||
def make_ner(
|
||||
|
|
|
@ -15,7 +15,6 @@ from .. import util
|
|||
"sentencizer",
|
||||
assigns=["token.is_sent_start", "doc.sents"],
|
||||
default_config={"punct_chars": None},
|
||||
scores=["sents_p", "sents_r", "sents_f"],
|
||||
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
|
||||
)
|
||||
def make_sentencizer(
|
||||
|
|
|
@ -36,7 +36,6 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]
|
|||
"senter",
|
||||
assigns=["token.is_sent_start"],
|
||||
default_config={"model": DEFAULT_SENTER_MODEL},
|
||||
scores=["sents_p", "sents_r", "sents_f"],
|
||||
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
|
||||
)
|
||||
def make_senter(nlp: Language, name: str, model: Model):
|
||||
|
@ -170,79 +169,3 @@ class SentenceRecognizer(Tagger):
|
|||
results = Scorer.score_spans(examples, "sents", **kwargs)
|
||||
del results["sents_per_type"]
|
||||
return results
|
||||
|
||||
def to_bytes(self, *, exclude=tuple()):
|
||||
"""Serialize the pipe to a bytestring.
|
||||
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (bytes): The serialized object.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/sentencerecognizer#to_bytes
|
||||
"""
|
||||
serialize = {}
|
||||
serialize["model"] = self.model.to_bytes
|
||||
serialize["vocab"] = self.vocab.to_bytes
|
||||
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
|
||||
return util.to_bytes(serialize, exclude)
|
||||
|
||||
def from_bytes(self, bytes_data, *, exclude=tuple()):
|
||||
"""Load the pipe from a bytestring.
|
||||
|
||||
bytes_data (bytes): The serialized pipe.
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (Tagger): The loaded SentenceRecognizer.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/sentencerecognizer#from_bytes
|
||||
"""
|
||||
def load_model(b):
|
||||
try:
|
||||
self.model.from_bytes(b)
|
||||
except AttributeError:
|
||||
raise ValueError(Errors.E149) from None
|
||||
|
||||
deserialize = {
|
||||
"vocab": lambda b: self.vocab.from_bytes(b),
|
||||
"cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
|
||||
"model": lambda b: load_model(b),
|
||||
}
|
||||
util.from_bytes(bytes_data, deserialize, exclude)
|
||||
return self
|
||||
|
||||
def to_disk(self, path, *, exclude=tuple()):
|
||||
"""Serialize the pipe to disk.
|
||||
|
||||
path (str / Path): Path to a directory.
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/sentencerecognizer#to_disk
|
||||
"""
|
||||
serialize = {
|
||||
"vocab": lambda p: self.vocab.to_disk(p),
|
||||
"model": lambda p: p.open("wb").write(self.model.to_bytes()),
|
||||
"cfg": lambda p: srsly.write_json(p, self.cfg),
|
||||
}
|
||||
util.to_disk(path, serialize, exclude)
|
||||
|
||||
def from_disk(self, path, *, exclude=tuple()):
|
||||
"""Load the pipe from disk. Modifies the object in place and returns it.
|
||||
|
||||
path (str / Path): Path to a directory.
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (Tagger): The modified SentenceRecognizer object.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/sentencerecognizer#from_disk
|
||||
"""
|
||||
def load_model(p):
|
||||
with p.open("rb") as file_:
|
||||
try:
|
||||
self.model.from_bytes(file_.read())
|
||||
except AttributeError:
|
||||
raise ValueError(Errors.E149) from None
|
||||
|
||||
deserialize = {
|
||||
"vocab": lambda p: self.vocab.from_disk(p),
|
||||
"cfg": lambda p: self.cfg.update(deserialize_config(p)),
|
||||
"model": load_model,
|
||||
}
|
||||
util.from_disk(path, deserialize, exclude)
|
||||
return self
|
||||
|
|
|
@ -42,7 +42,6 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
|
|||
"tagger",
|
||||
assigns=["token.tag"],
|
||||
default_config={"model": DEFAULT_TAGGER_MODEL},
|
||||
scores=["tag_acc"],
|
||||
default_score_weights={"tag_acc": 1.0},
|
||||
)
|
||||
def make_tagger(nlp: Language, name: str, model: Model):
|
||||
|
@ -168,7 +167,6 @@ class Tagger(Pipe):
|
|||
# Don't clobber preset POS tags
|
||||
if doc.c[j].tag == 0:
|
||||
doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
|
||||
doc.is_tagged = True
|
||||
|
||||
def update(self, examples, *, drop=0., sgd=None, losses=None, set_annotations=False):
|
||||
"""Learn from a batch of documents and gold-standard information,
|
||||
|
|
|
@ -62,18 +62,17 @@ subword_features = true
|
|||
"positive_label": None,
|
||||
"model": DEFAULT_TEXTCAT_MODEL,
|
||||
},
|
||||
scores=[
|
||||
"cats_score",
|
||||
"cats_score_desc",
|
||||
"cats_p",
|
||||
"cats_r",
|
||||
"cats_f",
|
||||
"cats_macro_f",
|
||||
"cats_macro_auc",
|
||||
"cats_f_per_type",
|
||||
"cats_macro_auc_per_type",
|
||||
],
|
||||
default_score_weights={"cats_score": 1.0},
|
||||
default_score_weights={
|
||||
"cats_score": 1.0,
|
||||
"cats_score_desc": None,
|
||||
"cats_p": None,
|
||||
"cats_r": None,
|
||||
"cats_f": None,
|
||||
"cats_macro_f": None,
|
||||
"cats_macro_auc": None,
|
||||
"cats_f_per_type": None,
|
||||
"cats_macro_auc_per_type": None,
|
||||
},
|
||||
)
|
||||
def make_textcat(
|
||||
nlp: Language,
|
||||
|
@ -181,9 +180,9 @@ class TextCategorizer(Pipe):
|
|||
|
||||
DOCS: https://nightly.spacy.io/api/textcategorizer#predict
|
||||
"""
|
||||
tensors = [doc.tensor for doc in docs]
|
||||
if not any(len(doc) for doc in docs):
|
||||
# Handle cases where there are no tokens in any docs.
|
||||
tensors = [doc.tensor for doc in docs]
|
||||
xp = get_array_module(tensors)
|
||||
scores = xp.zeros((len(docs), len(self.labels)))
|
||||
return scores
|
||||
|
|
|
@ -127,7 +127,7 @@ class Tok2Vec(Pipe):
|
|||
tokvecs = self.model.predict(docs)
|
||||
batch_id = Tok2VecListener.get_batch_id(docs)
|
||||
for listener in self.listeners:
|
||||
listener.receive(batch_id, tokvecs, None)
|
||||
listener.receive(batch_id, tokvecs, lambda dX: [])
|
||||
return tokvecs
|
||||
|
||||
def set_annotations(self, docs: Sequence[Doc], tokvecses) -> None:
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Dict, List, Union, Optional, Sequence, Any, Callable, Type, Tuple
|
||||
from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
|
||||
from typing import Iterable, TypeVar, TYPE_CHECKING
|
||||
from enum import Enum
|
||||
from pydantic import BaseModel, Field, ValidationError, validator
|
||||
|
@ -8,6 +8,7 @@ from collections import defaultdict
|
|||
from thinc.api import Optimizer
|
||||
|
||||
from .attrs import NAMES
|
||||
from .lookups import Lookups
|
||||
|
||||
if TYPE_CHECKING:
|
||||
# This lets us add type hints for mypy etc. without causing circular imports
|
||||
|
@ -104,7 +105,7 @@ class TokenPatternOperator(str, Enum):
|
|||
StringValue = Union[TokenPatternString, StrictStr]
|
||||
NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat]
|
||||
UnderscoreValue = Union[
|
||||
TokenPatternString, TokenPatternNumber, str, int, float, list, bool,
|
||||
TokenPatternString, TokenPatternNumber, str, int, float, list, bool
|
||||
]
|
||||
|
||||
|
||||
|
@ -198,8 +199,9 @@ class ModelMetaSchema(BaseModel):
|
|||
class ConfigSchemaTraining(BaseModel):
|
||||
# fmt: off
|
||||
vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
|
||||
train_corpus: Reader = Field(..., title="Reader for the training data")
|
||||
dev_corpus: Reader = Field(..., title="Reader for the dev data")
|
||||
lookups: Optional[Lookups] = Field(..., title="Vocab lookups")
|
||||
dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
|
||||
train_corpus: StrictStr = Field(..., title="Path in the config to the training data")
|
||||
batcher: Batcher = Field(..., title="Batcher for the training data")
|
||||
dropout: StrictFloat = Field(..., title="Dropout rate")
|
||||
patience: StrictInt = Field(..., title="How many steps to continue without improvement in evaluation score")
|
||||
|
@ -207,8 +209,9 @@ class ConfigSchemaTraining(BaseModel):
|
|||
max_steps: StrictInt = Field(..., title="Maximum number of update steps to train for")
|
||||
eval_frequency: StrictInt = Field(..., title="How often to evaluate during training (steps)")
|
||||
seed: Optional[StrictInt] = Field(..., title="Random seed")
|
||||
gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU")
|
||||
accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps")
|
||||
score_weights: Dict[StrictStr, Union[StrictFloat, StrictInt]] = Field(..., title="Scores to report and their weights for selecting final model")
|
||||
score_weights: Dict[StrictStr, Optional[Union[StrictFloat, StrictInt]]] = Field(..., title="Scores to report and their weights for selecting final model")
|
||||
init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
|
||||
raw_text: Optional[StrictStr] = Field(default=None, title="Raw text")
|
||||
optimizer: Optimizer = Field(..., title="The optimizer to use")
|
||||
|
@ -227,7 +230,6 @@ class ConfigSchemaNlp(BaseModel):
|
|||
pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
|
||||
disabled: List[StrictStr] = Field(..., title="Pipeline components to disable by default")
|
||||
tokenizer: Callable = Field(..., title="The tokenizer to use")
|
||||
load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data")
|
||||
before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization")
|
||||
after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed")
|
||||
after_pipeline_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after the pipeline is constructed")
|
||||
|
@ -249,11 +251,11 @@ class ConfigSchemaPretrain(BaseModel):
|
|||
dropout: StrictFloat = Field(..., title="Dropout rate")
|
||||
n_save_every: Optional[StrictInt] = Field(..., title="Saving frequency")
|
||||
optimizer: Optimizer = Field(..., title="The optimizer to use")
|
||||
corpus: Reader = Field(..., title="Reader for the training data")
|
||||
corpus: StrictStr = Field(..., title="Path in the config to the training data")
|
||||
batcher: Batcher = Field(..., title="Batcher for the training data")
|
||||
component: str = Field(..., title="Component to find the layer to pretrain")
|
||||
layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
|
||||
|
||||
|
||||
# TODO: use a more detailed schema for this?
|
||||
objective: Dict[str, Any] = Field(..., title="Pretraining objective")
|
||||
# fmt: on
|
||||
|
@ -268,6 +270,7 @@ class ConfigSchema(BaseModel):
|
|||
nlp: ConfigSchemaNlp
|
||||
pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {}
|
||||
components: Dict[str, Dict[str, Any]]
|
||||
corpora: Dict[str, Reader]
|
||||
|
||||
@root_validator(allow_reuse=True)
|
||||
def validate_config(cls, values):
|
||||
|
|
|
@ -240,7 +240,7 @@ class Scorer:
|
|||
pred_per_feat[field].add((gold_i, feat))
|
||||
for field in per_feat:
|
||||
per_feat[field].score_set(
|
||||
pred_per_feat.get(field, set()), gold_per_feat.get(field, set()),
|
||||
pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
|
||||
)
|
||||
result = {k: v.to_dict() for k, v in per_feat.items()}
|
||||
return {f"{attr}_per_feat": result}
|
||||
|
@ -270,6 +270,18 @@ class Scorer:
|
|||
for example in examples:
|
||||
pred_doc = example.predicted
|
||||
gold_doc = example.reference
|
||||
# TODO
|
||||
# This is a temporary hack to work around the problem that the scorer
|
||||
# fails if you have examples that are not fully annotated for all
|
||||
# the tasks in your pipeline. For instance, you might have a corpus
|
||||
# of NER annotations that does not set sentence boundaries, but the
|
||||
# pipeline includes a parser or senter, and then the score_weights
|
||||
# are used to evaluate that component. When the scorer attempts
|
||||
# to read the sentences from the gold document, it fails.
|
||||
try:
|
||||
list(getter(gold_doc, attr))
|
||||
except ValueError:
|
||||
continue
|
||||
# Find all labels in gold and doc
|
||||
labels = set(
|
||||
[k.label_ for k in getter(gold_doc, attr)]
|
||||
|
@ -406,9 +418,9 @@ class Scorer:
|
|||
f_per_type[pred_label].fp += 1
|
||||
micro_prf = PRFScore()
|
||||
for label_prf in f_per_type.values():
|
||||
micro_prf.tp = label_prf.tp
|
||||
micro_prf.fn = label_prf.fn
|
||||
micro_prf.fp = label_prf.fp
|
||||
micro_prf.tp += label_prf.tp
|
||||
micro_prf.fn += label_prf.fn
|
||||
micro_prf.fp += label_prf.fp
|
||||
n_cats = len(f_per_type) + 1e-100
|
||||
macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats
|
||||
macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats
|
||||
|
|
|
@ -17,7 +17,6 @@ Tests for spaCy modules and classes live in their own directories of the same na
|
|||
5. [Helpers and utilities](#helpers-and-utilities)
|
||||
6. [Contributing to the tests](#contributing-to-the-tests)
|
||||
|
||||
|
||||
## Running the tests
|
||||
|
||||
To show print statements, run the tests with `py.test -s`. To abort after the
|
||||
|
@ -39,19 +38,17 @@ py.test spacy/tests/tokenizer/test_exceptions.py::test_tokenizer_handles_emoji #
|
|||
|
||||
## Dos and don'ts
|
||||
|
||||
To keep the behaviour of the tests consistent and predictable, we try to follow a few basic conventions:
|
||||
|
||||
* **Test names** should follow a pattern of `test_[module]_[tested behaviour]`. For example: `test_tokenizer_keeps_email` or `test_spans_override_sentiment`.
|
||||
* If you're testing for a bug reported in a specific issue, always create a **regression test**. Regression tests should be named `test_issue[ISSUE NUMBER]` and live in the [`regression`](regression) directory.
|
||||
* Only use `@pytest.mark.xfail` for tests that **should pass, but currently fail**. To test for desired negative behaviour, use `assert not` in your test.
|
||||
* Very **extensive tests** that take a long time to run should be marked with `@pytest.mark.slow`. If your slow test is testing important behaviour, consider adding an additional simpler version.
|
||||
* If tests require **loading the models**, they should be added to the [`spacy-models`](https://github.com/explosion/spacy-models) tests.
|
||||
* Before requiring the models, always make sure there is no other way to test the particular behaviour. In a lot of cases, it's sufficient to simply create a `Doc` object manually. See the section on [helpers and utility functions](#helpers-and-utilities) for more info on this.
|
||||
* **Avoid unnecessary imports.** There should never be a need to explicitly import spaCy at the top of a file, and many components are available as [fixtures](#fixtures). You should also avoid wildcard imports (`from module import *`).
|
||||
* If you're importing from spaCy, **always use absolute imports**. For example: `from spacy.language import Language`.
|
||||
* Don't forget the **unicode declarations** at the top of each file. This way, unicode strings won't have to be prefixed with `u`.
|
||||
* Try to keep the tests **readable and concise**. Use clear and descriptive variable names (`doc`, `tokens` and `text` are great), keep it short and only test for one behaviour at a time.
|
||||
To keep the behavior of the tests consistent and predictable, we try to follow a few basic conventions:
|
||||
|
||||
- **Test names** should follow a pattern of `test_[module]_[tested behaviour]`. For example: `test_tokenizer_keeps_email` or `test_spans_override_sentiment`.
|
||||
- If you're testing for a bug reported in a specific issue, always create a **regression test**. Regression tests should be named `test_issue[ISSUE NUMBER]` and live in the [`regression`](regression) directory.
|
||||
- Only use `@pytest.mark.xfail` for tests that **should pass, but currently fail**. To test for desired negative behavior, use `assert not` in your test.
|
||||
- Very **extensive tests** that take a long time to run should be marked with `@pytest.mark.slow`. If your slow test is testing important behavior, consider adding an additional simpler version.
|
||||
- If tests require **loading the models**, they should be added to the [`spacy-models`](https://github.com/explosion/spacy-models) tests.
|
||||
- Before requiring the models, always make sure there is no other way to test the particular behavior. In a lot of cases, it's sufficient to simply create a `Doc` object manually. See the section on [helpers and utility functions](#helpers-and-utilities) for more info on this.
|
||||
- **Avoid unnecessary imports.** There should never be a need to explicitly import spaCy at the top of a file, and many components are available as [fixtures](#fixtures). You should also avoid wildcard imports (`from module import *`).
|
||||
- If you're importing from spaCy, **always use absolute imports**. For example: `from spacy.language import Language`.
|
||||
- Try to keep the tests **readable and concise**. Use clear and descriptive variable names (`doc`, `tokens` and `text` are great), keep it short and only test for one behavior at a time.
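
As a rough illustration of several of these points at once, a regression test for a hypothetical issue might look like the sketch below (the issue number and the behaviour tested are invented for the example; only `Doc` and the `en_vocab` fixture are assumed):

```python
from spacy.tokens import Doc


def test_issue1234(en_vocab):
    # Hypothetical regression test, named test_issue[ISSUE NUMBER] and
    # living in the regression directory.
    doc = Doc(en_vocab, words=["This", "is", "a", "lion"])
    doc.ents = []
    # Desired negative behaviour is asserted with `assert not`, not xfail.
    assert not doc.ents
```
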
## Parameters
|
||||
|
||||
|
@ -64,7 +61,7 @@ def test_tokenizer_keep_urls(tokenizer, text):
|
|||
assert len(tokens) == 1
|
||||
```
|
||||
|
||||
This will run the test once for each `text` value. Even if you're only testing one example, it's usually best to specify it as a parameter. This will later make it easier for others to quickly add additional test cases without having to modify the test.
|
||||
|
||||
You can also specify parameters as tuples to test with multiple values per test:
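
The code block that follows this sentence in the README falls outside the hunk shown here; as a minimal sketch (the texts and expected lengths are illustrative, assuming only the `en_tokenizer` fixture), tuple parameters unpack into several arguments:

```python
import pytest


@pytest.mark.parametrize(
    "text,length", [("U.S.", 1), ("Hello, world!", 4), ("don't", 2)]
)
def test_tokenizer_lengths(en_tokenizer, text, length):
    tokens = en_tokenizer(text)
    assert len(tokens) == length
```
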
@ -79,8 +76,7 @@ To test for combinations of parameters, you can add several `parametrize` marker
|
|||
@pytest.mark.parametrize('punct', ['.', '!', '?'])
|
||||
```
|
||||
|
||||
This will run the test with all combinations of the two parameters `text` and `punct`. **Use this feature sparingly**, though, as it can easily cause unneccessary or undesired test bloat.
|
||||
|
||||
This will run the test with all combinations of the two parameters `text` and `punct`. **Use this feature sparingly**, though, as it can easily cause unnecessary or undesired test bloat.
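
A minimal sketch of such stacked markers (values invented for illustration); pytest runs the test once per combination, here 2 x 3 = 6 times:

```python
import pytest


@pytest.mark.parametrize("text", ["Hello", "Goodbye"])
@pytest.mark.parametrize("punct", [".", "!", "?"])
def test_tokenizer_splits_trailing_punct(en_tokenizer, text, punct):
    tokens = en_tokenizer(text + punct)
    assert len(tokens) == 2
    assert tokens[1].text == punct
```
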
## Fixtures
|
||||
|
||||
|
@ -88,11 +84,11 @@ Fixtures to create instances of spaCy objects and other components should only b
|
|||
|
||||
These are the main fixtures that are currently available:
|
||||
|
||||
| Fixture | Description |
|
||||
| --- | --- |
|
||||
| `tokenizer` | Basic, language-independent tokenizer. Identical to the `xx` language class. |
|
||||
| `en_tokenizer`, `de_tokenizer`, ... | Creates an English, German etc. tokenizer. |
|
||||
| `en_vocab` | Creates an instance of the English `Vocab`. |
|
||||
| Fixture | Description |
|
||||
| ----------------------------------- | ---------------------------------------------------------------------------- |
|
||||
| `tokenizer` | Basic, language-independent tokenizer. Identical to the `xx` language class. |
|
||||
| `en_tokenizer`, `de_tokenizer`, ... | Creates an English, German etc. tokenizer. |
|
||||
| `en_vocab` | Creates an instance of the English `Vocab`. |
|
||||
|
||||
The fixtures can be used in all tests by simply setting them as an argument, like this:
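
The example that follows in the README sits outside this hunk; a minimal sketch of requesting fixtures by naming them as arguments (assuming the `en_tokenizer` and `en_vocab` fixtures from the table above):

```python
def test_en_tokenizer_handles_basic_text(en_tokenizer):
    # pytest injects the session-scoped English tokenizer from conftest.py
    tokens = en_tokenizer("This is a sentence.")
    assert tokens[0].text == "This"


def test_en_vocab_strings(en_vocab):
    en_vocab.strings.add("zebra")
    assert "zebra" in en_vocab.strings
```
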
@ -107,59 +103,32 @@ If all tests in a file require a specific configuration, or use the same complex
|
|||
|
||||
Our new test setup comes with a few handy utility functions that can be imported from [`util.py`](util.py).
|
||||
|
||||
### Constructing a `Doc` object manually
|
||||
|
||||
### Constructing a `Doc` object manually with `get_doc()`
|
||||
|
||||
Loading the models is expensive and not necessary if you're not actually testing the model performance. If all you need ia a `Doc` object with annotations like heads, POS tags or the dependency parse, you can use `get_doc()` to construct it manually.
|
||||
Loading the models is expensive and not necessary if you're not actually testing the model performance. If all you need is a `Doc` object with annotations like heads, POS tags or the dependency parse, you can construct it manually.
|
||||
|
||||
```python
|
||||
def test_doc_token_api_strings(en_tokenizer):
|
||||
def test_doc_token_api_strings(en_vocab):
|
||||
text = "Give it back! He pleaded."
|
||||
pos = ['VERB', 'PRON', 'PART', 'PUNCT', 'PRON', 'VERB', 'PUNCT']
|
||||
heads = [0, -1, -2, -3, 1, 0, -1]
|
||||
heads = [0, 0, 0, 0, 5, 5, 5]
|
||||
deps = ['ROOT', 'dobj', 'prt', 'punct', 'nsubj', 'ROOT', 'punct']
|
||||
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, [t.text for t in tokens], pos=pos, heads=heads, deps=deps)
|
||||
doc = Doc(en_vocab, [t.text for t in tokens], pos=pos, heads=heads, deps=deps)
|
||||
assert doc[0].text == 'Give'
|
||||
assert doc[0].lower_ == 'give'
|
||||
assert doc[0].pos_ == 'VERB'
|
||||
assert doc[0].dep_ == 'ROOT'
|
||||
```
|
||||
|
||||
You can construct a `Doc` with the following arguments:
|
||||
|
||||
| Argument | Description |
|
||||
| --- | --- |
|
||||
| `vocab` | `Vocab` instance to use. If you're tokenizing before creating a `Doc`, make sure to use the tokenizer's vocab. Otherwise, you can also use the `en_vocab` fixture. **(required)** |
|
||||
| `words` | List of words, for example `[t.text for t in tokens]`. **(required)** |
|
||||
| `heads` | List of heads as integers. |
|
||||
| `pos` | List of POS tags as text values. |
|
||||
| `tag` | List of tag names as text values. |
|
||||
| `dep` | List of dependencies as text values. |
|
||||
| `ents` | List of entity tuples with `start`, `end`, `label` (for example `(0, 2, 'PERSON')`). The `label` will be looked up in `vocab.strings[label]`. |
|
||||
|
||||
Here's how to quickly get these values from within spaCy:
|
||||
|
||||
```python
|
||||
doc = nlp(u'Some text here')
|
||||
print([token.head.i-token.i for token in doc])
|
||||
print([token.tag_ for token in doc])
|
||||
print([token.pos_ for token in doc])
|
||||
print([token.dep_ for token in doc])
|
||||
print([(ent.start, ent.end, ent.label_) for ent in doc.ents])
|
||||
```
|
||||
|
||||
**Note:** There's currently no way of setting the serializer data for the parser without loading the models. If this is relevant to your test, constructing the `Doc` via `get_doc()` won't work.
|
||||
|
||||
### Other utilities
|
||||
|
||||
| Name | Description |
|
||||
| --- | --- |
|
||||
| `apply_transition_sequence(parser, doc, sequence)` | Perform a series of pre-specified transitions, to put the parser in a desired state. |
|
||||
| `add_vecs_to_vocab(vocab, vectors)` | Add list of vector tuples (`[("text", [1, 2, 3])]`) to given vocab. All vectors need to have the same length. |
|
||||
| `get_cosine(vec1, vec2)` | Get cosine for two given vectors. |
|
||||
| `assert_docs_equal(doc1, doc2)` | Compare two `Doc` objects and `assert` that they're equal. Tests for tokens, tags, dependencies and entities. |
|
||||
| Name | Description |
|
||||
| -------------------------------------------------- | ------------------------------------------------------------------------------------------------------------- |
|
||||
| `apply_transition_sequence(parser, doc, sequence)` | Perform a series of pre-specified transitions, to put the parser in a desired state. |
|
||||
| `add_vecs_to_vocab(vocab, vectors)` | Add list of vector tuples (`[("text", [1, 2, 3])]`) to given vocab. All vectors need to have the same length. |
|
||||
| `get_cosine(vec1, vec2)` | Get cosine for two given vectors. |
|
||||
| `assert_docs_equal(doc1, doc2)` | Compare two `Doc` objects and `assert` that they're equal. Tests for tokens, tags, dependencies and entities. |
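
As a rough, untested sketch of how a couple of these helpers might be combined (the import path and argument shapes are assumed from the table above; inside a test subpackage the relative form `from ..util import ...` is used instead):

```python
from spacy.tests.util import add_vecs_to_vocab, assert_docs_equal, get_cosine
from spacy.tokens import Doc
from spacy.vocab import Vocab


def test_util_helpers_sketch():
    vocab = Vocab()
    vectors = [("apple", [1.0, 2.0, 3.0]), ("orange", [-1.0, 2.0, 3.0])]
    add_vecs_to_vocab(vocab, vectors)
    # Cosine of two vectors pointing in different directions is below 1.
    assert get_cosine(vectors[0][1], vectors[1][1]) < 1.0
    # Two identically constructed Docs should compare as equal.
    words = ["hello", "world"]
    assert_docs_equal(Doc(vocab, words=words), Doc(vocab, words=words))
```
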
## Contributing to the tests
|
||||
|
||||
|
|
|
@ -59,6 +59,11 @@ def de_tokenizer():
|
|||
return get_lang_class("de")().tokenizer
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def de_vocab():
|
||||
return get_lang_class("de")().vocab
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def el_tokenizer():
|
||||
return get_lang_class("el")().tokenizer
|
||||
|
|
|
@ -1,12 +1,10 @@
|
|||
from spacy.pipeline.ner import DEFAULT_NER_MODEL
|
||||
from spacy.training import Example
|
||||
from spacy.pipeline import EntityRecognizer
|
||||
from spacy.tokens import Span, Doc
|
||||
from spacy import registry
|
||||
import pytest
|
||||
|
||||
from ..util import get_doc
|
||||
from spacy.pipeline.ner import DEFAULT_NER_MODEL
|
||||
|
||||
|
||||
def _ner_example(ner):
|
||||
doc = Doc(
|
||||
|
@ -19,7 +17,7 @@ def _ner_example(ner):
|
|||
|
||||
def test_doc_add_entities_set_ents_iob(en_vocab):
|
||||
text = ["This", "is", "a", "lion"]
|
||||
doc = get_doc(en_vocab, text)
|
||||
doc = Doc(en_vocab, words=text)
|
||||
config = {
|
||||
"learn_tokens": False,
|
||||
"min_action_freq": 30,
|
||||
|
@ -41,7 +39,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
|
|||
def test_ents_reset(en_vocab):
|
||||
"""Ensure that resetting doc.ents does not change anything"""
|
||||
text = ["This", "is", "a", "lion"]
|
||||
doc = get_doc(en_vocab, text)
|
||||
doc = Doc(en_vocab, words=text)
|
||||
config = {
|
||||
"learn_tokens": False,
|
||||
"min_action_freq": 30,
|
||||
|
@ -59,7 +57,7 @@ def test_ents_reset(en_vocab):
|
|||
|
||||
def test_add_overlapping_entities(en_vocab):
|
||||
text = ["Louisiana", "Office", "of", "Conservation"]
|
||||
doc = get_doc(en_vocab, text)
|
||||
doc = Doc(en_vocab, words=text)
|
||||
entity = Span(doc, 0, 4, label=391)
|
||||
doc.ents = [entity]
|
||||
|
||||
|
|
|
@ -2,8 +2,6 @@ import pytest
|
|||
from spacy.tokens import Doc
|
||||
from spacy.attrs import ORTH, SHAPE, POS, DEP, MORPH
|
||||
|
||||
from ..util import get_doc
|
||||
|
||||
|
||||
def test_doc_array_attr_of_token(en_vocab):
|
||||
doc = Doc(en_vocab, words=["An", "example", "sentence"])
|
||||
|
@ -35,7 +33,7 @@ def test_doc_scalar_attr_of_token(en_vocab):
|
|||
def test_doc_array_tag(en_vocab):
|
||||
words = ["A", "nice", "sentence", "."]
|
||||
pos = ["DET", "ADJ", "NOUN", "PUNCT"]
|
||||
doc = get_doc(en_vocab, words=words, pos=pos)
|
||||
doc = Doc(en_vocab, words=words, pos=pos)
|
||||
assert doc[0].pos != doc[1].pos != doc[2].pos != doc[3].pos
|
||||
feats_array = doc.to_array((ORTH, POS))
|
||||
assert feats_array[0][1] == doc[0].pos
|
||||
|
@ -47,7 +45,7 @@ def test_doc_array_tag(en_vocab):
|
|||
def test_doc_array_morph(en_vocab):
|
||||
words = ["Eat", "blue", "ham"]
|
||||
morph = ["Feat=V", "Feat=J", "Feat=N"]
|
||||
doc = get_doc(en_vocab, words=words, morphs=morph)
|
||||
doc = Doc(en_vocab, words=words, morphs=morph)
|
||||
assert morph[0] == doc[0].morph_
|
||||
assert morph[1] == doc[1].morph_
|
||||
assert morph[2] == doc[2].morph_
|
||||
|
@ -61,7 +59,7 @@ def test_doc_array_morph(en_vocab):
|
|||
def test_doc_array_dep(en_vocab):
|
||||
words = ["A", "nice", "sentence", "."]
|
||||
deps = ["det", "amod", "ROOT", "punct"]
|
||||
doc = get_doc(en_vocab, words=words, deps=deps)
|
||||
doc = Doc(en_vocab, words=words, deps=deps)
|
||||
feats_array = doc.to_array((ORTH, DEP))
|
||||
assert feats_array[0][1] == doc[0].dep
|
||||
assert feats_array[1][1] == doc[1].dep
|
||||
|
|
|
@ -6,7 +6,22 @@ from spacy.lexeme import Lexeme
|
|||
from spacy.lang.en import English
|
||||
from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP, MORPH
|
||||
|
||||
from ..util import get_doc
|
||||
|
||||
def test_doc_api_init(en_vocab):
|
||||
words = ["a", "b", "c", "d"]
|
||||
heads = [0, 0, 2, 2]
|
||||
# set sent_start by sent_starts
|
||||
doc = Doc(en_vocab, words=words, sent_starts=[True, False, True, False])
|
||||
assert [t.is_sent_start for t in doc] == [True, False, True, False]
|
||||
|
||||
# set sent_start by heads
|
||||
doc = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * 4)
|
||||
assert [t.is_sent_start for t in doc] == [True, False, True, False]
|
||||
# heads override sent_starts
|
||||
doc = Doc(
|
||||
en_vocab, words=words, sent_starts=[True] * 4, heads=heads, deps=["dep"] * 4,
|
||||
)
|
||||
assert [t.is_sent_start for t in doc] == [True, False, True, False]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text", [["one", "two", "three"]])
|
||||
|
@ -106,6 +121,7 @@ def test_doc_api_serialize(en_tokenizer, text):
|
|||
tokens = en_tokenizer(text)
|
||||
tokens[0].lemma_ = "lemma"
|
||||
tokens[0].norm_ = "norm"
|
||||
tokens.ents = [(tokens.vocab.strings["PRODUCT"], 0, 1)]
|
||||
tokens[0].ent_kb_id_ = "ent_kb_id"
|
||||
new_tokens = Doc(tokens.vocab).from_bytes(tokens.to_bytes())
|
||||
assert tokens.text == new_tokens.text
|
||||
|
@ -144,7 +160,6 @@ def test_doc_api_set_ents(en_tokenizer):
|
|||
|
||||
def test_doc_api_sents_empty_string(en_tokenizer):
|
||||
doc = en_tokenizer("")
|
||||
doc.is_parsed = True
|
||||
sents = list(doc.sents)
|
||||
assert len(sents) == 0
|
||||
|
||||
|
@ -158,7 +173,7 @@ def test_doc_api_runtime_error(en_tokenizer):
|
|||
"", "nummod", "nsubj", "prep", "det", "amod", "pobj", "aux", "neg", "ccomp", "amod", "dobj"]
|
||||
# fmt: on
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps)
|
||||
doc = Doc(tokens.vocab, words=[t.text for t in tokens], deps=deps)
|
||||
nps = []
|
||||
for np in doc.noun_chunks:
|
||||
while len(np) > 1 and np[0].dep_ not in ("advmod", "amod", "compound"):
|
||||
|
@ -175,16 +190,19 @@ def test_doc_api_runtime_error(en_tokenizer):
|
|||
retokenizer.merge(np, attrs=attrs)
|
||||
|
||||
|
||||
def test_doc_api_right_edge(en_tokenizer):
|
||||
def test_doc_api_right_edge(en_vocab):
|
||||
"""Test for bug occurring from Unshift action, causing incorrect right edge"""
|
||||
# fmt: off
|
||||
text = "I have proposed to myself, for the sake of such as live under the government of the Romans, to translate those books into the Greek tongue."
|
||||
heads = [2, 1, 0, -1, -1, -3, 15, 1, -2, -1, 1, -3, -1, -1, 1, -2, -1, 1,
|
||||
-2, -7, 1, -19, 1, -2, -3, 2, 1, -3, -26]
|
||||
words = [
|
||||
"I", "have", "proposed", "to", "myself", ",", "for", "the", "sake",
|
||||
"of", "such", "as", "live", "under", "the", "government", "of", "the",
|
||||
"Romans", ",", "to", "translate", "those", "books", "into", "the",
|
||||
"Greek", "tongue", "."
|
||||
]
|
||||
heads = [2, 2, 2, 2, 3, 2, 21, 8, 6, 8, 11, 8, 11, 12, 15, 13, 15, 18, 16, 12, 21, 2, 23, 21, 21, 27, 27, 24, 2]
|
||||
deps = ["dep"] * len(heads)
|
||||
# fmt: on
|
||||
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
||||
doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
|
||||
assert doc[6].text == "for"
|
||||
subtree = [w.text for w in doc[6].subtree]
|
||||
# fmt: off
|
||||
|
@ -212,16 +230,16 @@ def test_doc_api_similarity_match():
|
|||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"sentence,heads,lca_matrix",
|
||||
"words,heads,lca_matrix",
|
||||
[
|
||||
(
|
||||
"the lazy dog slept",
|
||||
[2, 1, 1, 0],
|
||||
["the", "lazy", "dog", "slept"],
|
||||
[2, 2, 3, 3],
|
||||
numpy.array([[0, 2, 2, 3], [2, 1, 2, 3], [2, 2, 2, 3], [3, 3, 3, 3]]),
|
||||
),
|
||||
(
|
||||
"The lazy dog slept. The quick fox jumped",
|
||||
[2, 1, 1, 0, -1, 2, 1, 1, 0],
|
||||
["The", "lazy", "dog", "slept", ".", "The", "quick", "fox", "jumped"],
|
||||
[2, 2, 3, 3, 3, 7, 7, 8, 8],
|
||||
numpy.array(
|
||||
[
|
||||
[0, 2, 2, 3, 3, -1, -1, -1, -1],
|
||||
|
@ -238,9 +256,8 @@ def test_doc_api_similarity_match():
|
|||
),
|
||||
],
|
||||
)
|
||||
def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix):
|
||||
tokens = en_tokenizer(sentence)
|
||||
doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
|
||||
def test_lowest_common_ancestor(en_vocab, words, heads, lca_matrix):
|
||||
doc = Doc(en_vocab, words, heads=heads, deps=["dep"] * len(heads))
|
||||
lca = doc.get_lca_matrix()
|
||||
assert (lca == lca_matrix).all()
|
||||
assert lca[1, 1] == 1
|
||||
|
@ -251,67 +268,64 @@ def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix):
|
|||
def test_doc_is_nered(en_vocab):
|
||||
words = ["I", "live", "in", "New", "York"]
|
||||
doc = Doc(en_vocab, words=words)
|
||||
assert not doc.is_nered
|
||||
assert not doc.has_annotation("ENT_IOB")
|
||||
doc.ents = [Span(doc, 3, 5, label="GPE")]
|
||||
assert doc.is_nered
|
||||
assert doc.has_annotation("ENT_IOB")
|
||||
# Test creating doc from array with unknown values
|
||||
arr = numpy.array([[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]], dtype="uint64")
|
||||
doc = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], arr)
|
||||
assert doc.is_nered
|
||||
assert doc.has_annotation("ENT_IOB")
|
||||
# Test serialization
|
||||
new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
|
||||
assert new_doc.is_nered
|
||||
assert new_doc.has_annotation("ENT_IOB")
|
||||
|
||||
|
||||
def test_doc_from_array_sent_starts(en_vocab):
|
||||
# fmt: off
|
||||
words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."]
|
||||
heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6]
|
||||
# fmt: off
|
||||
deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep", "dep"]
|
||||
deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep"]
|
||||
# fmt: on
|
||||
doc = Doc(en_vocab, words=words)
|
||||
for i, (dep, head) in enumerate(zip(deps, heads)):
|
||||
doc[i].dep_ = dep
|
||||
doc[i].head = doc[head]
|
||||
if head == i:
|
||||
doc[i].is_sent_start = True
|
||||
doc.is_parsed
|
||||
|
||||
doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
|
||||
# HEAD overrides SENT_START without warning
|
||||
attrs = [SENT_START, HEAD]
|
||||
arr = doc.to_array(attrs)
|
||||
new_doc = Doc(en_vocab, words=words)
|
||||
with pytest.raises(ValueError):
|
||||
new_doc.from_array(attrs, arr)
|
||||
# no warning using default attrs
|
||||
attrs = doc._get_array_attrs()
|
||||
arr = doc.to_array(attrs)
|
||||
with pytest.warns(None) as record:
|
||||
new_doc.from_array(attrs, arr)
|
||||
|
||||
attrs = [SENT_START, DEP]
|
||||
assert len(record) == 0
|
||||
# only SENT_START uses SENT_START
|
||||
attrs = [SENT_START]
|
||||
arr = doc.to_array(attrs)
|
||||
new_doc = Doc(en_vocab, words=words)
|
||||
new_doc.from_array(attrs, arr)
|
||||
assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc]
|
||||
assert not new_doc.is_parsed
|
||||
|
||||
assert not new_doc.has_annotation("DEP")
|
||||
# only HEAD uses HEAD
|
||||
attrs = [HEAD, DEP]
|
||||
arr = doc.to_array(attrs)
|
||||
new_doc = Doc(en_vocab, words=words)
|
||||
new_doc.from_array(attrs, arr)
|
||||
assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc]
|
||||
assert new_doc.is_parsed
|
||||
assert new_doc.has_annotation("DEP")
|
||||
|
||||
|
||||
def test_doc_from_array_morph(en_vocab):
|
||||
words = ["I", "live", "in", "New", "York", "."]
|
||||
# fmt: off
|
||||
words = ["I", "live", "in", "New", "York", "."]
|
||||
morphs = ["Feat1=A", "Feat1=B", "Feat1=C", "Feat1=A|Feat2=D", "Feat2=E", "Feat3=F"]
|
||||
# fmt: on
|
||||
doc = Doc(en_vocab, words=words)
|
||||
for i, morph in enumerate(morphs):
|
||||
doc[i].morph_ = morph
|
||||
|
||||
attrs = [MORPH]
|
||||
arr = doc.to_array(attrs)
|
||||
new_doc = Doc(en_vocab, words=words)
|
||||
new_doc.from_array(attrs, arr)
|
||||
|
||||
assert [t.morph_ for t in new_doc] == morphs
|
||||
assert [t.morph_ for t in doc] == [t.morph_ for t in new_doc]
|
||||
|
||||
|
@ -323,15 +337,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
|
|||
en_docs = [en_tokenizer(text) for text in en_texts]
|
||||
docs_idx = en_texts[0].index("docs")
|
||||
de_doc = de_tokenizer(de_text)
|
||||
en_docs[0].user_data[("._.", "is_ambiguous", docs_idx, None)] = (
|
||||
True,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
)
|
||||
|
||||
expected = (True, None, None, None)
|
||||
en_docs[0].user_data[("._.", "is_ambiguous", docs_idx, None)] = expected
|
||||
assert Doc.from_docs([]) is None
|
||||
|
||||
assert de_doc is not Doc.from_docs([de_doc])
|
||||
assert str(de_doc) == str(Doc.from_docs([de_doc]))
|
||||
|
||||
|
@ -365,9 +373,6 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
|
|||
assert m_doc[9].idx == think_idx
|
||||
|
||||
m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
|
||||
with pytest.raises(ValueError):
|
||||
# important attributes from sentenziser or parser are missing
|
||||
assert list(m_doc.sents)
|
||||
assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
|
||||
# space delimiter considered, although spacy attribute was missing
|
||||
assert str(m_doc) == " ".join(en_texts_without_empty)
|
||||
|
@ -379,6 +384,15 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
|
|||
assert m_doc[9].idx == think_idx
|
||||
|
||||
|
||||
def test_doc_api_from_docs_ents(en_tokenizer):
|
||||
texts = ["Merging the docs is fun.", "They don't think alike."]
|
||||
docs = [en_tokenizer(t) for t in texts]
|
||||
docs[0].ents = ()
|
||||
docs[1].ents = (Span(docs[1], 0, 1, label="foo"),)
|
||||
doc = Doc.from_docs(docs)
|
||||
assert len(doc.ents) == 1
|
||||
|
||||
|
||||
def test_doc_lang(en_vocab):
|
||||
doc = Doc(en_vocab, words=["Hello", "world"])
|
||||
assert doc.lang_ == "en"
|
||||
|
@ -399,3 +413,58 @@ def test_token_lexeme(en_vocab):
|
|||
assert isinstance(token.lex, Lexeme)
|
||||
assert token.lex.text == token.text
|
||||
assert en_vocab[token.orth] == token.lex
|
||||
|
||||
|
||||
def test_has_annotation(en_vocab):
|
||||
doc = Doc(en_vocab, words=["Hello", "world"])
|
||||
attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "HEAD", "ENT_IOB", "ENT_TYPE")
|
||||
for attr in attrs:
|
||||
assert not doc.has_annotation(attr)
|
||||
|
||||
doc[0].tag_ = "A"
|
||||
doc[0].pos_ = "X"
|
||||
doc[0].morph_ = "Feat=Val"
|
||||
doc[0].lemma_ = "a"
|
||||
doc[0].dep_ = "dep"
|
||||
doc[0].head = doc[1]
|
||||
doc.ents = [Span(doc, 0, 1, label="HELLO")]
|
||||
|
||||
for attr in attrs:
|
||||
assert doc.has_annotation(attr)
|
||||
assert not doc.has_annotation(attr, require_complete=True)
|
||||
|
||||
doc[1].tag_ = "A"
|
||||
doc[1].pos_ = "X"
|
||||
doc[1].morph_ = ""
|
||||
doc[1].lemma_ = "a"
|
||||
doc[1].dep_ = "dep"
|
||||
doc.ents = [Span(doc, 0, 2, label="HELLO")]
|
||||
|
||||
for attr in attrs:
|
||||
assert doc.has_annotation(attr)
|
||||
assert doc.has_annotation(attr, require_complete=True)
|
||||
|
||||
|
||||
def test_is_flags_deprecated(en_tokenizer):
|
||||
doc = en_tokenizer("test")
|
||||
with pytest.deprecated_call():
|
||||
doc.is_tagged
|
||||
with pytest.deprecated_call():
|
||||
doc.is_parsed
|
||||
with pytest.deprecated_call():
|
||||
doc.is_nered
|
||||
with pytest.deprecated_call():
|
||||
doc.is_sentenced
|
||||
|
||||
|
||||
def test_doc_set_ents():
|
||||
"""Test that both strings and integers can be used to set entities in
|
||||
tuple format via doc.ents."""
|
||||
words = ["a", "b", "c", "d", "e"]
|
||||
doc = Doc(Vocab(), words=words)
|
||||
doc.ents = [("HELLO", 0, 2), (doc.vocab.strings.add("WORLD"), 3, 5)]
|
||||
assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
|
||||
vocab = Vocab()
|
||||
ents = [("HELLO", 0, 2), (vocab.strings.add("WORLD"), 3, 5)]
|
||||
doc = Doc(vocab, words=words, ents=ents)
|
||||
assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
|
||||
|
|
|
@ -3,8 +3,6 @@ from spacy.attrs import LEMMA
|
|||
from spacy.vocab import Vocab
|
||||
from spacy.tokens import Doc, Token
|
||||
|
||||
from ..util import get_doc
|
||||
|
||||
|
||||
def test_doc_retokenize_merge(en_tokenizer):
|
||||
text = "WKRO played songs by the beach boys all night"
|
||||
|
@ -88,9 +86,9 @@ def test_doc_retokenize_lex_attrs(en_tokenizer):
|
|||
|
||||
def test_doc_retokenize_spans_merge_tokens(en_tokenizer):
|
||||
text = "Los Angeles start."
|
||||
heads = [1, 1, 0, -1]
|
||||
heads = [1, 2, 2, 2]
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
||||
doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
||||
assert len(doc) == 4
|
||||
assert doc[0].head.text == "Angeles"
|
||||
assert doc[1].head.text == "start"
|
||||
|
@ -103,17 +101,12 @@ def test_doc_retokenize_spans_merge_tokens(en_tokenizer):
|
|||
assert doc[0].ent_type_ == "GPE"
|
||||
|
||||
|
||||
def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer):
|
||||
text = "The players start."
|
||||
heads = [1, 1, 0, -1]
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(
|
||||
tokens.vocab,
|
||||
words=[t.text for t in tokens],
|
||||
tags=["DT", "NN", "VBZ", "."],
|
||||
pos=["DET", "NOUN", "VERB", "PUNCT"],
|
||||
heads=heads,
|
||||
)
|
||||
def test_doc_retokenize_spans_merge_tokens_default_attrs(en_vocab):
|
||||
words = ["The", "players", "start", "."]
|
||||
heads = [1, 2, 2, 2]
|
||||
tags = ["DT", "NN", "VBZ", "."]
|
||||
pos = ["DET", "NOUN", "VERB", "PUNCT"]
|
||||
doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads)
|
||||
assert len(doc) == 4
|
||||
assert doc[0].text == "The"
|
||||
assert doc[0].tag_ == "DT"
|
||||
|
@ -124,13 +117,7 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer):
|
|||
assert doc[0].text == "The players"
|
||||
assert doc[0].tag_ == "NN"
|
||||
assert doc[0].pos_ == "NOUN"
|
||||
doc = get_doc(
|
||||
tokens.vocab,
|
||||
words=[t.text for t in tokens],
|
||||
tags=["DT", "NN", "VBZ", "."],
|
||||
pos=["DET", "NOUN", "VERB", "PUNCT"],
|
||||
heads=heads,
|
||||
)
|
||||
doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads)
|
||||
assert len(doc) == 4
|
||||
assert doc[0].text == "The"
|
||||
assert doc[0].tag_ == "DT"
|
||||
|
@ -147,11 +134,10 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer):
|
|||
assert doc[1].pos_ == "VERB"
|
||||
|
||||
|
||||
def test_doc_retokenize_spans_merge_heads(en_tokenizer):
|
||||
text = "I found a pilates class near work."
|
||||
heads = [1, 0, 2, 1, -3, -1, -1, -6]
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
||||
def test_doc_retokenize_spans_merge_heads(en_vocab):
|
||||
words = ["I", "found", "a", "pilates", "class", "near", "work", "."]
|
||||
heads = [1, 1, 4, 6, 1, 4, 5, 1]
|
||||
doc = Doc(en_vocab, words=words, heads=heads)
|
||||
assert len(doc) == 8
|
||||
with doc.retokenize() as retokenizer:
|
||||
attrs = {"tag": doc[4].tag_, "lemma": "pilates class", "ent_type": "O"}
|
||||
|
@ -182,9 +168,9 @@ def test_doc_retokenize_spans_merge_non_disjoint(en_tokenizer):
|
|||
|
||||
def test_doc_retokenize_span_np_merges(en_tokenizer):
|
||||
text = "displaCy is a parse tool built with Javascript"
|
||||
heads = [1, 0, 2, 1, -3, -1, -1, -1]
|
||||
heads = [1, 1, 4, 4, 1, 4, 5, 6]
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
||||
doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
||||
assert doc[4].head.i == 1
|
||||
with doc.retokenize() as retokenizer:
|
||||
attrs = {"tag": "NP", "lemma": "tool", "ent_type": "O"}
|
||||
|
@ -192,18 +178,18 @@ def test_doc_retokenize_span_np_merges(en_tokenizer):
|
|||
assert doc[2].head.i == 1
|
||||
|
||||
text = "displaCy is a lightweight and modern dependency parse tree visualization tool built with CSS3 and JavaScript."
|
||||
heads = [1, 0, 8, 3, -1, -2, 4, 3, 1, 1, -9, -1, -1, -1, -1, -2, -15]
|
||||
heads = [1, 1, 10, 7, 3, 3, 7, 10, 9, 10, 1, 10, 11, 12, 13, 13, 1]
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
||||
doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
||||
with doc.retokenize() as retokenizer:
|
||||
for ent in doc.ents:
|
||||
attrs = {"tag": ent.label_, "lemma": ent.lemma_, "ent_type": ent.label_}
|
||||
retokenizer.merge(ent, attrs=attrs)
|
||||
|
||||
text = "One test with entities like New York City so the ents list is not void"
|
||||
heads = [1, 11, -1, -1, -1, 1, 1, -3, 4, 2, 1, 1, 0, -1, -2]
|
||||
heads = [1, 1, 1, 2, 3, 6, 7, 4, 12, 11, 11, 12, 1, 12, 12]
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
||||
doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
||||
with doc.retokenize() as retokenizer:
|
||||
for ent in doc.ents:
|
||||
retokenizer.merge(ent)
|
||||
|
@ -212,12 +198,12 @@ def test_doc_retokenize_span_np_merges(en_tokenizer):
|
|||
def test_doc_retokenize_spans_entity_merge(en_tokenizer):
|
||||
# fmt: off
|
||||
text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale.\n"
|
||||
heads = [1, 1, 0, 1, 2, -1, -4, 1, -2, -1, -1, -3, -10, 1, -2, -13, -1]
|
||||
heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15]
|
||||
tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"]
|
||||
ents = [(0, 2, "PERSON"), (10, 11, "GPE"), (13, 15, "PERSON")]
|
||||
ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)]
|
||||
# fmt: on
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(
|
||||
doc = Doc(
|
||||
tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags, ents=ents
|
||||
)
|
||||
assert len(doc) == 17
|
||||
|
@ -282,13 +268,13 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
|
|||
|
||||
# if there is a parse, span.root provides default values
|
||||
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
|
||||
heads = [0, -1, 1, -3, -4, -5, -1, -7, -8]
|
||||
ents = [(3, 5, "ent-de"), (5, 7, "ent-fg")]
|
||||
heads = [0, 0, 3, 0, 0, 0, 5, 0, 0]
|
||||
ents = [("ent-de", 3, 5), ("ent-fg", 5, 7)]
|
||||
deps = ["dep"] * len(words)
|
||||
en_vocab.strings.add("ent-de")
|
||||
en_vocab.strings.add("ent-fg")
|
||||
en_vocab.strings.add("dep")
|
||||
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
|
||||
doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
|
||||
assert doc[2:4].root == doc[3] # root of 'c d' is d
|
||||
assert doc[4:6].root == doc[4] # root is 'e f' is e
|
||||
with doc.retokenize() as retokenizer:
|
||||
|
@ -305,10 +291,10 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
|
|||
|
||||
# check that B is preserved if span[start] is B
|
||||
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
|
||||
heads = [0, -1, 1, 1, -4, -5, -1, -7, -8]
|
||||
ents = [(3, 5, "ent-de"), (5, 7, "ent-de")]
|
||||
heads = [0, 0, 3, 4, 0, 0, 5, 0, 0]
|
||||
ents = [("ent-de", 3, 5), ("ent-de", 5, 7)]
|
||||
deps = ["dep"] * len(words)
|
||||
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
|
||||
doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
|
||||
with doc.retokenize() as retokenizer:
|
||||
retokenizer.merge(doc[3:5])
|
||||
retokenizer.merge(doc[5:7])
|
||||
|
@ -322,13 +308,13 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
|
|||
def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer):
|
||||
# fmt: off
|
||||
text = "Stewart Lee is a stand up comedian. He lives in England and loves Joe Pasquale."
|
||||
heads = [1, 1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2, -7]
|
||||
heads = [1, 2, 2, 4, 2, 4, 4, 2, 9, 9, 9, 10, 9, 9, 15, 13, 9]
|
||||
deps = ['compound', 'nsubj', 'ROOT', 'det', 'amod', 'prt', 'attr',
|
||||
'punct', 'nsubj', 'ROOT', 'prep', 'pobj', 'cc', 'conj',
|
||||
'compound', 'dobj', 'punct']
|
||||
# fmt: on
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
||||
doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
||||
sent1, sent2 = list(doc.sents)
|
||||
init_len = len(sent1)
|
||||
init_len2 = len(sent2)
|
||||
|
@ -343,13 +329,13 @@ def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer):
|
|||
def test_doc_retokenize_spans_subtree_size_check(en_tokenizer):
|
||||
# fmt: off
|
||||
text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale"
|
||||
heads = [1, 1, 0, 1, 2, -1, -4, 1, -2, -1, -1, -3, -10, 1, -2]
|
||||
heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12]
|
||||
deps = ["compound", "nsubj", "ROOT", "det", "amod", "prt", "attr",
|
||||
"nsubj", "relcl", "prep", "pobj", "cc", "conj", "compound",
|
||||
"dobj"]
|
||||
# fmt: on
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
||||
doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
||||
sent1 = list(doc.sents)[0]
|
||||
init_len = len(list(sent1.root.subtree))
|
||||
with doc.retokenize() as retokenizer:
|
||||
|
|
|
@ -2,13 +2,11 @@ import pytest
|
|||
from spacy.vocab import Vocab
|
||||
from spacy.tokens import Doc, Token
|
||||
|
||||
from ..util import get_doc
|
||||
|
||||
|
||||
def test_doc_retokenize_split(en_vocab):
|
||||
words = ["LosAngeles", "start", "."]
|
||||
heads = [1, 1, 0]
|
||||
doc = get_doc(en_vocab, words=words, heads=heads)
|
||||
heads = [1, 2, 2]
|
||||
doc = Doc(en_vocab, words=words, heads=heads)
|
||||
assert len(doc) == 3
|
||||
assert len(str(doc)) == 19
|
||||
assert doc[0].head.text == "start"
|
||||
|
@ -88,11 +86,11 @@ def test_doc_retokenize_spans_sentence_update_after_split(en_vocab):
|
|||
# fmt: off
|
||||
words = ["StewartLee", "is", "a", "stand", "up", "comedian", ".", "He",
|
||||
"lives", "in", "England", "and", "loves", "JoePasquale", "."]
|
||||
heads = [1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2]
|
||||
heads = [1, 1, 3, 5, 3, 1, 1, 8, 8, 8, 9, 8, 8, 14, 12]
|
||||
deps = ["nsubj", "ROOT", "det", "amod", "prt", "attr", "punct", "nsubj",
|
||||
"ROOT", "prep", "pobj", "cc", "conj", "compound", "punct"]
|
||||
# fmt: on
|
||||
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
|
||||
doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
|
||||
sent1, sent2 = list(doc.sents)
|
||||
init_len = len(sent1)
|
||||
init_len2 = len(sent2)
|
||||
|
|
|
@ -4,19 +4,17 @@ from spacy.tokens import Doc, Span
|
|||
from spacy.vocab import Vocab
|
||||
from spacy.util import filter_spans
|
||||
|
||||
from ..util import get_doc
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def doc(en_tokenizer):
|
||||
# fmt: off
|
||||
text = "This is a sentence. This is another sentence. And a third."
|
||||
heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3, 0, 1, -2, -1]
|
||||
heads = [1, 1, 3, 1, 1, 6, 6, 8, 6, 6, 12, 12, 12, 12]
|
||||
deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det",
|
||||
"attr", "punct", "ROOT", "det", "npadvmod", "punct"]
|
||||
# fmt: on
|
||||
tokens = en_tokenizer(text)
|
||||
return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
||||
return Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
@ -24,7 +22,6 @@ def doc_not_parsed(en_tokenizer):
|
|||
text = "This is a sentence. This is another sentence. And a third."
|
||||
tokens = en_tokenizer(text)
|
||||
doc = Doc(tokens.vocab, words=[t.text for t in tokens])
|
||||
doc.is_parsed = False
|
||||
return doc
|
||||
|
||||
|
||||
|
@ -70,9 +67,10 @@ def test_spans_string_fn(doc):
|
|||
|
||||
def test_spans_root2(en_tokenizer):
|
||||
text = "through North and South Carolina"
|
||||
heads = [0, 3, -1, -2, -4]
|
||||
heads = [0, 4, 1, 1, 0]
|
||||
deps = ["dep"] * len(heads)
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
||||
doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
||||
assert doc[-2:].root.text == "Carolina"
|
||||
|
||||
|
||||
|
@ -92,7 +90,12 @@ def test_spans_span_sent(doc, doc_not_parsed):
|
|||
def test_spans_lca_matrix(en_tokenizer):
|
||||
"""Test span's lca matrix generation"""
|
||||
tokens = en_tokenizer("the lazy dog slept")
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0])
|
||||
doc = Doc(
|
||||
tokens.vocab,
|
||||
words=[t.text for t in tokens],
|
||||
heads=[2, 2, 3, 3],
|
||||
deps=["dep"] * 4,
|
||||
)
|
||||
lca = doc[:2].get_lca_matrix()
|
||||
assert lca.shape == (2, 2)
|
||||
assert lca[0, 0] == 0 # the & the -> the
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
import pytest
|
||||
from spacy.tokens import Doc
|
||||
from ..util import get_doc
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
|
@ -8,10 +7,10 @@ def doc(en_vocab):
|
|||
words = ["c", "d", "e"]
|
||||
pos = ["VERB", "NOUN", "NOUN"]
|
||||
tags = ["VBP", "NN", "NN"]
|
||||
heads = [0, -1, -2]
|
||||
heads = [0, 0, 0]
|
||||
deps = ["ROOT", "dobj", "dobj"]
|
||||
ents = [(1, 2, "ORG")]
|
||||
return get_doc(
|
||||
ents = [("ORG", 1, 2)]
|
||||
return Doc(
|
||||
en_vocab, words=words, pos=pos, tags=tags, heads=heads, deps=deps, ents=ents
|
||||
)
|
||||
|
||||
|
|
|
@ -5,31 +5,24 @@ from spacy.symbols import VERB
|
|||
from spacy.vocab import Vocab
|
||||
from spacy.tokens import Doc
|
||||
|
||||
from ..util import get_doc
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def doc(en_tokenizer):
|
||||
def doc(en_vocab):
|
||||
# fmt: off
|
||||
text = "This is a sentence. This is another sentence. And a third."
|
||||
heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3, 0, 1, -2, -1]
|
||||
words = ["This", "is", "a", "sentence", ".", "This", "is", "another", "sentence", ".", "And", "a", "third", "."]
|
||||
heads = [1, 1, 3, 1, 1, 6, 6, 8, 6, 6, 10, 12, 10, 12]
|
||||
deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det",
|
||||
"attr", "punct", "ROOT", "det", "npadvmod", "punct"]
|
||||
# fmt: on
|
||||
tokens = en_tokenizer(text)
|
||||
return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
||||
return Doc(en_vocab, words=words, heads=heads, deps=deps)
|
||||
|
||||
|
||||
def test_doc_token_api_strings(en_tokenizer):
|
||||
text = "Give it back! He pleaded."
|
||||
def test_doc_token_api_strings(en_vocab):
|
||||
words = ["Give", "it", "back", "!", "He", "pleaded", "."]
|
||||
pos = ["VERB", "PRON", "PART", "PUNCT", "PRON", "VERB", "PUNCT"]
|
||||
heads = [0, -1, -2, -3, 1, 0, -1]
|
||||
heads = [0, 0, 0, 0, 5, 5, 5]
|
||||
deps = ["ROOT", "dobj", "prt", "punct", "nsubj", "ROOT", "punct"]
|
||||
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(
|
||||
tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
|
||||
)
|
||||
doc = Doc(en_vocab, words=words, pos=pos, heads=heads, deps=deps)
|
||||
assert doc[0].orth_ == "Give"
|
||||
assert doc[0].text == "Give"
|
||||
assert doc[0].text_with_ws == "Give "
@@ -97,77 +90,91 @@ def test_doc_token_api_vectors():
    assert doc[0].similarity(doc[1]) == cosine


def test_doc_token_api_ancestors(en_tokenizer):
def test_doc_token_api_ancestors(en_vocab):
    # the structure of this sentence depends on the English annotation scheme
    text = "Yesterday I saw a dog that barked loudly."
    heads = [2, 1, 0, 1, -2, 1, -2, -1, -6]
    tokens = en_tokenizer(text)
    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
    words = ["Yesterday", "I", "saw", "a", "dog", "that", "barked", "loudly", "."]
    heads = [2, 2, 2, 4, 2, 6, 4, 6, 2]
    doc = Doc(en_vocab, words=words, heads=heads)
    assert [t.text for t in doc[6].ancestors] == ["dog", "saw"]
    assert [t.text for t in doc[1].ancestors] == ["saw"]
    assert [t.text for t in doc[2].ancestors] == []

    assert doc[2].is_ancestor(doc[7])
    assert not doc[6].is_ancestor(doc[2])


def test_doc_token_api_head_setter(en_tokenizer):
    # the structure of this sentence depends on the English annotation scheme
    text = "Yesterday I saw a dog that barked loudly."
    heads = [2, 1, 0, 1, -2, 1, -2, -1, -6]
    tokens = en_tokenizer(text)
    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)

def test_doc_token_api_head_setter(en_vocab):
    words = ["Yesterday", "I", "saw", "a", "dog", "that", "barked", "loudly", "."]
    heads = [2, 2, 2, 4, 2, 6, 4, 6, 2]
    deps = ["dep"] * len(heads)
    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
    assert doc[6].n_lefts == 1
    assert doc[6].n_rights == 1
    assert doc[6].left_edge.i == 5
    assert doc[6].right_edge.i == 7

    assert doc[4].n_lefts == 1
    assert doc[4].n_rights == 1
    assert doc[4].left_edge.i == 3
    assert doc[4].right_edge.i == 7

    assert doc[3].n_lefts == 0
    assert doc[3].n_rights == 0
    assert doc[3].left_edge.i == 3
    assert doc[3].right_edge.i == 3

    assert doc[2].left_edge.i == 0
    assert doc[2].right_edge.i == 8

    doc[6].head = doc[3]

    assert doc[6].n_lefts == 1
    assert doc[6].n_rights == 1
    assert doc[6].left_edge.i == 5
    assert doc[6].right_edge.i == 7

    assert doc[3].n_lefts == 0
    assert doc[3].n_rights == 1
    assert doc[3].left_edge.i == 3
    assert doc[3].right_edge.i == 7

    assert doc[4].n_lefts == 1
    assert doc[4].n_rights == 0
    assert doc[4].left_edge.i == 3
    assert doc[4].right_edge.i == 7

    assert doc[2].left_edge.i == 0
    assert doc[2].right_edge.i == 8

    doc[0].head = doc[5]

    assert doc[5].left_edge.i == 0
    assert doc[6].left_edge.i == 0
    assert doc[3].left_edge.i == 0
    assert doc[4].left_edge.i == 0
    assert doc[2].left_edge.i == 0

    # head token must be from the same document
    doc2 = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
    doc2 = Doc(en_vocab, words=words, heads=heads)
    with pytest.raises(ValueError):
        doc[0].head = doc2[0]
    # test sentence starts when two sentences are joined
    # fmt: off
    words = ["This", "is", "one", "sentence", ".", "This", "is", "another", "sentence", "."]
    heads = [0, 0, 0, 0, 0, 5, 5, 5, 5, 5]
    # fmt: on
    doc = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads))
    # initially two sentences
    assert doc[0].is_sent_start
    assert doc[5].is_sent_start
    assert doc[0].left_edge == doc[0]
    assert doc[0].right_edge == doc[4]
    assert doc[5].left_edge == doc[5]
    assert doc[5].right_edge == doc[9]
    # modifying within a sentence doesn't change sent starts
    doc[2].head = doc[3]
    assert doc[0].is_sent_start
    assert doc[5].is_sent_start
    assert doc[0].left_edge == doc[0]
    assert doc[0].right_edge == doc[4]
    assert doc[5].left_edge == doc[5]
    assert doc[5].right_edge == doc[9]
    # attach the second sentence to the first, resulting in one sentence
    doc[5].head = doc[0]
    assert doc[0].is_sent_start
    assert not doc[5].is_sent_start
    assert doc[0].left_edge == doc[0]
    assert doc[0].right_edge == doc[9]

def test_is_sent_start(en_tokenizer):

@@ -175,7 +182,6 @@ def test_is_sent_start(en_tokenizer):
    assert doc[5].is_sent_start is None
    doc[5].is_sent_start = True
    assert doc[5].is_sent_start is True
    doc.is_parsed = True
    assert len(list(doc.sents)) == 2


@@ -184,7 +190,6 @@ def test_is_sent_end(en_tokenizer):
    assert doc[4].is_sent_end is None
    doc[5].is_sent_start = True
    assert doc[4].is_sent_end is True
    doc.is_parsed = True
    assert len(list(doc.sents)) == 2


@@ -209,39 +214,39 @@ def test_token0_has_sent_start_true():
    doc = Doc(Vocab(), words=["hello", "world"])
    assert doc[0].is_sent_start is True
    assert doc[1].is_sent_start is None
    assert not doc.is_sentenced
    assert not doc.has_annotation("SENT_START")


def test_tokenlast_has_sent_end_true():
    doc = Doc(Vocab(), words=["hello", "world"])
    assert doc[0].is_sent_end is None
    assert doc[1].is_sent_end is True
    assert not doc.is_sentenced
    assert not doc.has_annotation("SENT_START")
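The switch from `doc.is_sentenced` to `doc.has_annotation("SENT_START")` in the two tests above follows the same pattern as the other removed flags (`is_tagged`, `is_parsed`): instead of a boolean property on the `Doc`, the presence of an annotation is queried by attribute name. A brief sketch of the idea, assuming spaCy v3's `Doc.has_annotation`:

```python
from spacy.tokens import Doc
from spacy.vocab import Vocab

doc = Doc(Vocab(), words=["hello", "world"])
assert not doc.has_annotation("SENT_START")  # no sentence boundaries set yet
assert not doc.has_annotation("DEP")         # no dependency parse either

doc[1].is_sent_start = True                  # annotate a boundary by hand
assert doc.has_annotation("SENT_START")
```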
def test_token_api_conjuncts_chain(en_vocab):
    words = "The boy and the girl and the man went .".split()
    heads = [1, 7, -1, 1, -3, -1, 1, -3, 0, -1]
    words = ["The", "boy", "and", "the", "girl", "and", "the", "man", "went", "."]
    heads = [1, 8, 1, 4, 1, 4, 7, 4, 8, 8]
    deps = ["det", "nsubj", "cc", "det", "conj", "cc", "det", "conj", "ROOT", "punct"]
    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
    assert [w.text for w in doc[1].conjuncts] == ["girl", "man"]
    assert [w.text for w in doc[4].conjuncts] == ["boy", "man"]
    assert [w.text for w in doc[7].conjuncts] == ["boy", "girl"]


def test_token_api_conjuncts_simple(en_vocab):
    words = "They came and went .".split()
    heads = [1, 0, -1, -2, -1]
    words = ["They", "came", "and", "went", "."]
    heads = [1, 1, 1, 1, 3]
    deps = ["nsubj", "ROOT", "cc", "conj", "dep"]
    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
    assert [w.text for w in doc[1].conjuncts] == ["went"]
    assert [w.text for w in doc[3].conjuncts] == ["came"]


def test_token_api_non_conjuncts(en_vocab):
    words = "They came .".split()
    heads = [1, 0, -1]
    words = ["They", "came", "."]
    heads = [1, 1, 1]
    deps = ["nsubj", "ROOT", "punct"]
    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
    assert [w.text for w in doc[0].conjuncts] == []
    assert [w.text for w in doc[1].conjuncts] == []
@@ -3,11 +3,7 @@ import pytest

def test_noun_chunks_is_parsed_de(de_tokenizer):
    """Test that noun_chunks raises ValueError for 'de' language if Doc is not parsed.
    To check this test, we're constructing a Doc
    with a new Vocab here and forcing is_parsed to 'False'
    to make sure the noun chunks don't run.
    """
    doc = de_tokenizer("Er lag auf seinem")
    doc.is_parsed = False
    with pytest.raises(ValueError):
        list(doc.noun_chunks)
@@ -1,30 +1,26 @@
from ...util import get_doc
from spacy.tokens import Doc


def test_de_parser_noun_chunks_standard_de(de_tokenizer):
    text = "Eine Tasse steht auf dem Tisch."
    heads = [1, 1, 0, -1, 1, -2, -4]
def test_de_parser_noun_chunks_standard_de(de_vocab):
    words = ["Eine", "Tasse", "steht", "auf", "dem", "Tisch", "."]
    heads = [1, 2, 2, 2, 5, 3, 2]
    pos = ["DET", "NOUN", "VERB", "ADP", "DET", "NOUN", "PUNCT"]
    deps = ["nk", "sb", "ROOT", "mo", "nk", "nk", "punct"]
    tokens = de_tokenizer(text)
    doc = get_doc(
        tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads
    )
    doc = Doc(de_vocab, words=words, pos=pos, deps=deps, heads=heads)
    chunks = list(doc.noun_chunks)
    assert len(chunks) == 2
    assert chunks[0].text_with_ws == "Eine Tasse "
    assert chunks[1].text_with_ws == "dem Tisch "


def test_de_extended_chunk(de_tokenizer):
    text = "Die Sängerin singt mit einer Tasse Kaffee Arien."
    heads = [1, 1, 0, -1, 1, -2, -1, -5, -6]
def test_de_extended_chunk(de_vocab):
    # fmt: off
    words = ["Die", "Sängerin", "singt", "mit", "einer", "Tasse", "Kaffee", "Arien", "."]
    heads = [1, 2, 2, 2, 5, 3, 5, 2, 2]
    pos = ["DET", "NOUN", "VERB", "ADP", "DET", "NOUN", "NOUN", "NOUN", "PUNCT"]
    deps = ["nk", "sb", "ROOT", "mo", "nk", "nk", "nk", "oa", "punct"]
    tokens = de_tokenizer(text)
    doc = get_doc(
        tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads
    )
    # fmt: on
    doc = Doc(de_vocab, words=words, pos=pos, deps=deps, heads=heads)
    chunks = list(doc.noun_chunks)
    assert len(chunks) == 3
    assert chunks[0].text_with_ws == "Die Sängerin "
@@ -3,11 +3,7 @@ import pytest

def test_noun_chunks_is_parsed_el(el_tokenizer):
    """Test that noun_chunks raises ValueError for 'el' language if Doc is not parsed.
    To check this test, we're constructing a Doc
    with a new Vocab here and forcing is_parsed to 'False'
    to make sure the noun chunks don't run.
    """
    doc = el_tokenizer("είναι χώρα της νοτιοανατολικής")
    doc.is_parsed = False
    with pytest.raises(ValueError):
        list(doc.noun_chunks)
@@ -2,30 +2,23 @@ import numpy
from spacy.attrs import HEAD, DEP
from spacy.symbols import nsubj, dobj, amod, nmod, conj, cc, root
from spacy.lang.en.syntax_iterators import noun_chunks

from spacy.tokens import Doc
import pytest


from ...util import get_doc


def test_noun_chunks_is_parsed(en_tokenizer):
    """Test that noun_chunks raises ValueError for 'en' language if Doc is not parsed.
    To check this test, we're constructing a Doc
    with a new Vocab here and forcing is_parsed to 'False'
    to make sure the noun chunks don't run.
    """
    doc = en_tokenizer("This is a sentence")
    doc.is_parsed = False
    with pytest.raises(ValueError):
        list(doc.noun_chunks)


def test_en_noun_chunks_not_nested(en_vocab):
    words = ["Peter", "has", "chronic", "command", "and", "control", "issues"]
    heads = [1, 0, 4, 3, -1, -2, -5]
    heads = [1, 1, 6, 6, 3, 3, 1]
    deps = ["nsubj", "ROOT", "amod", "nmod", "cc", "conj", "dobj"]
    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
    doc.from_array(
        [HEAD, DEP],
        numpy.asarray(
@@ -1,63 +1,51 @@
from ...util import get_doc
from spacy.tokens import Doc


def test_en_parser_noun_chunks_standard(en_tokenizer):
    text = "A base phrase should be recognized."
    heads = [2, 1, 3, 2, 1, 0, -1]
def test_en_parser_noun_chunks_standard(en_vocab):
    words = ["A", "base", "phrase", "should", "be", "recognized", "."]
    heads = [2, 2, 5, 5, 5, 5, 5]
    pos = ["DET", "ADJ", "NOUN", "AUX", "VERB", "VERB", "PUNCT"]
    deps = ["det", "amod", "nsubjpass", "aux", "auxpass", "ROOT", "punct"]
    tokens = en_tokenizer(text)
    doc = get_doc(
        tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads
    )
    doc = Doc(en_vocab, words=words, pos=pos, deps=deps, heads=heads)
    chunks = list(doc.noun_chunks)
    assert len(chunks) == 1
    assert chunks[0].text_with_ws == "A base phrase "


def test_en_parser_noun_chunks_coordinated(en_tokenizer):
def test_en_parser_noun_chunks_coordinated(en_vocab):
    # fmt: off
    text = "A base phrase and a good phrase are often the same."
    heads = [2, 1, 5, -1, 2, 1, -4, 0, -1, 1, -3, -4]
    words = ["A", "base", "phrase", "and", "a", "good", "phrase", "are", "often", "the", "same", "."]
    heads = [2, 2, 7, 2, 6, 6, 2, 7, 7, 10, 7, 7]
    pos = ["DET", "NOUN", "NOUN", "CCONJ", "DET", "ADJ", "NOUN", "VERB", "ADV", "DET", "ADJ", "PUNCT"]
    deps = ["det", "compound", "nsubj", "cc", "det", "amod", "conj", "ROOT", "advmod", "det", "attr", "punct"]
    # fmt: on
    tokens = en_tokenizer(text)
    doc = get_doc(
        tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads
    )
    doc = Doc(en_vocab, words=words, pos=pos, deps=deps, heads=heads)
    chunks = list(doc.noun_chunks)
    assert len(chunks) == 2
    assert chunks[0].text_with_ws == "A base phrase "
    assert chunks[1].text_with_ws == "a good phrase "


def test_en_parser_noun_chunks_pp_chunks(en_tokenizer):
    text = "A phrase with another phrase occurs."
    heads = [1, 4, -1, 1, -2, 0, -1]
def test_en_parser_noun_chunks_pp_chunks(en_vocab):
    words = ["A", "phrase", "with", "another", "phrase", "occurs", "."]
    heads = [1, 5, 1, 4, 2, 5, 5]
    pos = ["DET", "NOUN", "ADP", "DET", "NOUN", "VERB", "PUNCT"]
    deps = ["det", "nsubj", "prep", "det", "pobj", "ROOT", "punct"]
    tokens = en_tokenizer(text)
    doc = get_doc(
        tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads
    )
    doc = Doc(en_vocab, words=words, pos=pos, deps=deps, heads=heads)
    chunks = list(doc.noun_chunks)
    assert len(chunks) == 2
    assert chunks[0].text_with_ws == "A phrase "
    assert chunks[1].text_with_ws == "another phrase "


def test_en_parser_noun_chunks_appositional_modifiers(en_tokenizer):
def test_en_parser_noun_chunks_appositional_modifiers(en_vocab):
    # fmt: off
    text = "Sam, my brother, arrived to the house."
    heads = [5, -1, 1, -3, -4, 0, -1, 1, -2, -4]
    words = ["Sam", ",", "my", "brother", ",", "arrived", "to", "the", "house", "."]
    heads = [5, 0, 3, 0, 0, 5, 5, 8, 6, 5]
    pos = ["PROPN", "PUNCT", "DET", "NOUN", "PUNCT", "VERB", "ADP", "DET", "NOUN", "PUNCT"]
    deps = ["nsubj", "punct", "poss", "appos", "punct", "ROOT", "prep", "det", "pobj", "punct"]
    # fmt: on
    tokens = en_tokenizer(text)
    doc = get_doc(
        tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads
    )
    doc = Doc(en_vocab, words=words, pos=pos, deps=deps, heads=heads)
    chunks = list(doc.noun_chunks)
    assert len(chunks) == 3
    assert chunks[0].text_with_ws == "Sam "

@@ -65,15 +53,12 @@ def test_en_parser_noun_chunks_appositional_modifiers(en_tokenizer):
    assert chunks[2].text_with_ws == "the house "


def test_en_parser_noun_chunks_dative(en_tokenizer):
    text = "She gave Bob a raise."
    heads = [1, 0, -1, 1, -3, -4]
def test_en_parser_noun_chunks_dative(en_vocab):
    words = ["She", "gave", "Bob", "a", "raise", "."]
    heads = [1, 1, 1, 4, 1, 1]
    pos = ["PRON", "VERB", "PROPN", "DET", "NOUN", "PUNCT"]
    deps = ["nsubj", "ROOT", "dative", "det", "dobj", "punct"]
    tokens = en_tokenizer(text)
    doc = get_doc(
        tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads
    )
    doc = Doc(en_vocab, words=words, pos=pos, deps=deps, heads=heads)
    chunks = list(doc.noun_chunks)
    assert len(chunks) == 3
    assert chunks[0].text_with_ws == "She "
@@ -1,14 +1,16 @@
import pytest
from spacy.tokens import Doc

from ...util import get_doc, apply_transition_sequence
from ...util import apply_transition_sequence


@pytest.mark.parametrize("text", ["A test sentence"])
@pytest.mark.parametrize("words", [["A", "test", "sentence"]])
@pytest.mark.parametrize("punct", [".", "!", "?", ""])
def test_en_sbd_single_punct(en_tokenizer, text, punct):
    heads = [2, 1, 0, -1] if punct else [2, 1, 0]
    tokens = en_tokenizer(text + punct)
    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
def test_en_sbd_single_punct(en_vocab, words, punct):
    heads = [2, 2, 2, 2] if punct else [2, 2, 2]
    deps = ["dep"] * len(heads)
    words = [*words, punct] if punct else words
    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
    assert len(doc) == 4 if punct else 3
    assert len(list(doc.sents)) == 1
    assert sum(len(sent) for sent in doc.sents) == len(doc)

@@ -17,17 +19,16 @@ def test_en_sbd_single_punct(en_tokenizer, text, punct):
@pytest.mark.skip(
    reason="The step_through API was removed (but should be brought back)"
)
def test_en_sentence_breaks(en_tokenizer, en_parser):
def test_en_sentence_breaks(en_vocab, en_parser):
    # fmt: off
    text = "This is a sentence . This is another one ."
    heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3]
    words = ["This", "is", "a", "sentence", ".", "This", "is", "another", "one", "."]
    heads = [1, 1, 3, 1, 1, 6, 6, 8, 6, 6]
    deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det",
            "attr", "punct"]
    transition = ["L-nsubj", "S", "L-det", "R-attr", "D", "R-punct", "B-ROOT",
                  "L-nsubj", "S", "L-attr", "R-attr", "D", "R-punct"]
    # fmt: on
    tokens = en_tokenizer(text)
    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
    apply_transition_sequence(en_parser, doc, transition)
    assert len(list(doc.sents)) == 2
    for token in doc:
@@ -3,11 +3,7 @@ import pytest

def test_noun_chunks_is_parsed_es(es_tokenizer):
    """Test that noun_chunks raises ValueError for 'es' language if Doc is not parsed.
    To check this test, we're constructing a Doc
    with a new Vocab here and forcing is_parsed to 'False'
    to make sure the noun chunks don't run.
    """
    doc = es_tokenizer("en Oxford este verano")
    doc.is_parsed = False
    with pytest.raises(ValueError):
        list(doc.noun_chunks)
@@ -3,12 +3,8 @@ import pytest

def test_noun_chunks_is_parsed_fa(fa_tokenizer):
    """Test that noun_chunks raises ValueError for 'fa' language if Doc is not parsed.
    To check this test, we're constructing a Doc
    with a new Vocab here and forcing is_parsed to 'False'
    to make sure the noun chunks don't run.
    """

    doc = fa_tokenizer("این یک جمله نمونه می باشد.")
    doc.is_parsed = False
    with pytest.raises(ValueError):
        list(doc.noun_chunks)
@@ -3,11 +3,7 @@ import pytest

def test_noun_chunks_is_parsed_fr(fr_tokenizer):
    """Test that noun_chunks raises ValueError for 'fr' language if Doc is not parsed.
    To check this test, we're constructing a Doc
    with a new Vocab here and forcing is_parsed to 'False'
    to make sure the noun chunks don't run.
    """
    doc = fr_tokenizer("trouver des travaux antérieurs")
    doc.is_parsed = False
    with pytest.raises(ValueError):
        list(doc.noun_chunks)
@@ -3,11 +3,7 @@ import pytest

def test_noun_chunks_is_parsed_id(id_tokenizer):
    """Test that noun_chunks raises ValueError for 'id' language if Doc is not parsed.
    To check this test, we're constructing a Doc
    with a new Vocab here and forcing is_parsed to 'False'
    to make sure the noun chunks don't run.
    """
    doc = id_tokenizer("sebelas")
    doc.is_parsed = False
    with pytest.raises(ValueError):
        list(doc.noun_chunks)
@@ -3,11 +3,7 @@ import pytest

def test_noun_chunks_is_parsed_nb(nb_tokenizer):
    """Test that noun_chunks raises ValueError for 'nb' language if Doc is not parsed.
    To check this test, we're constructing a Doc
    with a new Vocab here and forcing is_parsed to 'False'
    to make sure the noun chunks don't run.
    """
    doc = nb_tokenizer("Smørsausen brukes bl.a. til")
    doc.is_parsed = False
    with pytest.raises(ValueError):
        list(doc.noun_chunks)
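These per-language `is_parsed` tests are removed because the flag no longer exists; the behaviour they exercised, `Doc.noun_chunks` refusing to run without a dependency parse, remains. A rough sketch of that behaviour, assuming spaCy v3 semantics where the check is keyed off missing `DEP` annotation rather than an `is_parsed` flag:

```python
import pytest
from spacy.lang.en import English
from spacy.tokens import Doc

nlp = English()
doc = Doc(nlp.vocab, words=["This", "is", "a", "sentence"])  # tokens only, no parse
with pytest.raises(ValueError):
    list(doc.noun_chunks)
```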
@@ -1,6 +1,5 @@
import pytest

from ...util import get_doc
from spacy.tokens import Doc


def test_ru_doc_lemmatization(ru_lemmatizer):

@@ -11,7 +10,7 @@ def test_ru_doc_lemmatization(ru_lemmatizer):
        "Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act",
        "Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing",
    ]
    doc = get_doc(ru_lemmatizer.vocab, words=words, pos=pos, morphs=morphs)
    doc = Doc(ru_lemmatizer.vocab, words=words, pos=pos, morphs=morphs)
    doc = ru_lemmatizer(doc)
    lemmas = [token.lemma_ for token in doc]
    assert lemmas == ["мама", "мыть", "рама"]

@@ -28,7 +27,7 @@ def test_ru_doc_lemmatization(ru_lemmatizer):
    ],
)
def test_ru_lemmatizer_noun_lemmas(ru_lemmatizer, text, lemmas):
    doc = get_doc(ru_lemmatizer.vocab, words=[text], pos=["NOUN"])
    doc = Doc(ru_lemmatizer.vocab, words=[text], pos=["NOUN"])
    result_lemmas = ru_lemmatizer.pymorphy2_lemmatize(doc[0])
    assert sorted(result_lemmas) == lemmas


@@ -51,7 +50,7 @@ def test_ru_lemmatizer_noun_lemmas(ru_lemmatizer, text, lemmas):
def test_ru_lemmatizer_works_with_different_pos_homonyms(
    ru_lemmatizer, text, pos, morph, lemma
):
    doc = get_doc(ru_lemmatizer.vocab, words=[text], pos=[pos], morphs=[morph])
    doc = Doc(ru_lemmatizer.vocab, words=[text], pos=[pos], morphs=[morph])
    result_lemmas = ru_lemmatizer.pymorphy2_lemmatize(doc[0])
    assert result_lemmas == [lemma]


@@ -66,13 +65,13 @@ def test_ru_lemmatizer_works_with_different_pos_homonyms(
    ],
)
def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morph, lemma):
    doc = get_doc(ru_lemmatizer.vocab, words=[text], pos=["NOUN"], morphs=[morph])
    doc = Doc(ru_lemmatizer.vocab, words=[text], pos=["NOUN"], morphs=[morph])
    result_lemmas = ru_lemmatizer.pymorphy2_lemmatize(doc[0])
    assert result_lemmas == [lemma]


def test_ru_lemmatizer_punct(ru_lemmatizer):
    doc = get_doc(ru_lemmatizer.vocab, words=["«"], pos=["PUNCT"])
    doc = Doc(ru_lemmatizer.vocab, words=["«"], pos=["PUNCT"])
    assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
    doc = get_doc(ru_lemmatizer.vocab, words=["»"], pos=["PUNCT"])
    doc = Doc(ru_lemmatizer.vocab, words=["»"], pos=["PUNCT"])
    assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
@@ -1,16 +1,11 @@
import pytest

from ...util import get_doc
from spacy.tokens import Doc


def test_noun_chunks_is_parsed_sv(sv_tokenizer):
    """Test that noun_chunks raises ValueError for 'sv' language if Doc is not parsed.
    To check this test, we're constructing a Doc
    with a new Vocab here and forcing is_parsed to 'False'
    to make sure the noun chunks don't run.
    """
    doc = sv_tokenizer("Studenten läste den bästa boken")
    doc.is_parsed = False
    with pytest.raises(ValueError):
        list(doc.noun_chunks)


@@ -20,21 +15,21 @@ SV_NP_TEST_EXAMPLES = [
        "En student läste en bok",  # A student read a book
        ["DET", "NOUN", "VERB", "DET", "NOUN"],
        ["det", "nsubj", "ROOT", "det", "dobj"],
        [1, 1, 0, 1, -2],
        [1, 2, 2, 4, 2],
        ["En student", "en bok"],
    ),
    (
        "Studenten läste den bästa boken.",  # The student read the best book
        ["NOUN", "VERB", "DET", "ADJ", "NOUN", "PUNCT"],
        ["nsubj", "ROOT", "det", "amod", "dobj", "punct"],
        [1, 0, 2, 1, -3, -4],
        [1, 1, 4, 4, 1, 1],
        ["Studenten", "den bästa boken"],
    ),
    (
        "De samvetslösa skurkarna hade stulit de största juvelerna på söndagen",  # The remorseless crooks had stolen the largest jewels that sunday
        ["DET", "ADJ", "NOUN", "VERB", "VERB", "DET", "ADJ", "NOUN", "ADP", "NOUN"],
        ["det", "amod", "nsubj", "aux", "root", "det", "amod", "dobj", "case", "nmod"],
        [2, 1, 2, 1, 0, 2, 1, -3, 1, -5],
        [2, 2, 4, 4, 4, 7, 7, 4, 9, 4],
        ["De samvetslösa skurkarna", "de största juvelerna", "på söndagen"],
    ),
]

@@ -45,12 +40,9 @@ SV_NP_TEST_EXAMPLES = [
)
def test_sv_noun_chunks(sv_tokenizer, text, pos, deps, heads, expected_noun_chunks):
    tokens = sv_tokenizer(text)

    assert len(heads) == len(pos)
    doc = get_doc(
        tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps, pos=pos
    )

    words = [t.text for t in tokens]
    doc = Doc(tokens.vocab, words=words, heads=heads, deps=deps, pos=pos)
    noun_chunks = list(doc.noun_chunks)
    assert len(noun_chunks) == len(expected_noun_chunks)
    for i, np in enumerate(noun_chunks):
@@ -8,7 +8,7 @@ from spacy.util import get_lang_class
# Only include languages with no external dependencies
# excluded: ru, uk
# excluded for custom tables: pl
LANGUAGES = ["el", "en", "fr", "nl"]
LANGUAGES = ["bn", "el", "en", "fa", "fr", "nb", "nl", "sv"]
# fmt: on
@@ -4,16 +4,15 @@ import re
import copy
from mock import Mock
from spacy.matcher import DependencyMatcher
from ..util import get_doc
from spacy.tokens import Doc


@pytest.fixture
def doc(en_vocab):
    text = "The quick brown fox jumped over the lazy fox"
    heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
    words = ["The", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "fox"]
    heads = [3, 3, 3, 4, 4, 4, 8, 8, 5]
    deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "pobj", "det", "amod"]
    doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
    return doc
    return Doc(en_vocab, words=words, heads=heads, deps=deps)
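The fixture above only constructs the shared `Doc`; the patterns that query it live further down in the file. For orientation, a hedged sketch of how such a doc is typically searched with `DependencyMatcher` in spaCy v3; the pattern, the key `"JUMP_SUBJECT"`, and the expected match are illustrative assumptions, not part of this diff:

```python
from spacy.matcher import DependencyMatcher
from spacy.tokens import Doc
from spacy.vocab import Vocab

vocab = Vocab()
words = ["The", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "fox"]
heads = [3, 3, 3, 4, 4, 4, 8, 8, 5]
deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "pobj", "det", "amod"]
doc = Doc(vocab, words=words, heads=heads, deps=deps)

matcher = DependencyMatcher(vocab)
pattern = [
    {"RIGHT_ID": "verb", "RIGHT_ATTRS": {"ORTH": "jumped"}},
    {"LEFT_ID": "verb", "REL_OP": ">", "RIGHT_ID": "subject", "RIGHT_ATTRS": {"DEP": "nsubj"}},
]
matcher.add("JUMP_SUBJECT", [pattern])
matches = matcher(doc)  # list of (match_id, [verb_index, subject_index]) tuples
```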
@pytest.fixture

@@ -236,10 +235,10 @@ def test_dependency_matcher_callback(en_vocab, doc):
@pytest.mark.parametrize("op,num_matches", [(".", 8), (".*", 20), (";", 8), (";*", 20)])
def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches):
    # two sentences to test that all matches are within the same sentence
    doc = get_doc(
    doc = Doc(
        en_vocab,
        words=["a", "b", "c", "d", "e"] * 2,
        heads=[0, -1, -2, -3, -4] * 2,
        heads=[0, 0, 0, 0, 0, 5, 5, 5, 5, 5],
        deps=["dep"] * 10,
    )
    match_count = 0
@@ -301,11 +301,14 @@ def test_matcher_basic_check(en_vocab):

def test_attr_pipeline_checks(en_vocab):
    doc1 = Doc(en_vocab, words=["Test"])
    doc1.is_parsed = True
    doc1[0].dep_ = "ROOT"
    doc2 = Doc(en_vocab, words=["Test"])
    doc2.is_tagged = True
    doc2[0].tag_ = "TAG"
    doc2[0].pos_ = "X"
    doc2[0].morph_ = "Feat=Val"
    doc2[0].lemma_ = "LEMMA"
    doc3 = Doc(en_vocab, words=["Test"])
    # DEP requires is_parsed
    # DEP requires DEP
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"DEP": "a"}]])
    matcher(doc1)

@@ -313,7 +316,7 @@ def test_attr_pipeline_checks(en_vocab):
    matcher(doc2)
    with pytest.raises(ValueError):
        matcher(doc3)
    # TAG, POS, LEMMA require is_tagged
    # TAG, POS, LEMMA require those values
    for attr in ("TAG", "POS", "LEMMA"):
        matcher = Matcher(en_vocab)
        matcher.add("TEST", [[{attr: "a"}]])
@@ -3,7 +3,6 @@ import srsly
from mock import Mock
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc, Span
from ..util import get_doc


def test_matcher_phrase_matcher(en_vocab):

@@ -140,10 +139,10 @@ def test_phrase_matcher_string_attrs(en_vocab):
    pos1 = ["PRON", "VERB", "NOUN"]
    words2 = ["Yes", ",", "you", "hate", "dogs", "very", "much"]
    pos2 = ["INTJ", "PUNCT", "PRON", "VERB", "NOUN", "ADV", "ADV"]
    pattern = get_doc(en_vocab, words=words1, pos=pos1)
    pattern = Doc(en_vocab, words=words1, pos=pos1)
    matcher = PhraseMatcher(en_vocab, attr="POS")
    matcher.add("TEST", [pattern])
    doc = get_doc(en_vocab, words=words2, pos=pos2)
    doc = Doc(en_vocab, words=words2, pos=pos2)
    matches = matcher(doc)
    assert len(matches) == 1
    match_id, start, end = matches[0]

@@ -158,10 +157,10 @@ def test_phrase_matcher_string_attrs_negative(en_vocab):
    pos1 = ["PRON", "VERB", "NOUN"]
    words2 = ["matcher:POS-PRON", "matcher:POS-VERB", "matcher:POS-NOUN"]
    pos2 = ["X", "X", "X"]
    pattern = get_doc(en_vocab, words=words1, pos=pos1)
    pattern = Doc(en_vocab, words=words1, pos=pos1)
    matcher = PhraseMatcher(en_vocab, attr="POS")
    matcher.add("TEST", [pattern])
    doc = get_doc(en_vocab, words=words2, pos=pos2)
    doc = Doc(en_vocab, words=words2, pos=pos2)
    matches = matcher(doc)
    assert len(matches) == 0


@@ -187,9 +186,11 @@ def test_phrase_matcher_bool_attrs(en_vocab):

def test_phrase_matcher_validation(en_vocab):
    doc1 = Doc(en_vocab, words=["Test"])
    doc1.is_parsed = True
    doc1[0].dep_ = "ROOT"
    doc2 = Doc(en_vocab, words=["Test"])
    doc2.is_tagged = True
    doc2[0].tag_ = "TAG"
    doc2[0].pos_ = "X"
    doc2[0].morph_ = "Feat=Val"
    doc3 = Doc(en_vocab, words=["Test"])
    matcher = PhraseMatcher(en_vocab, validate=True)
    with pytest.warns(UserWarning):

@@ -212,18 +213,21 @@ def test_attr_validation(en_vocab):

def test_attr_pipeline_checks(en_vocab):
    doc1 = Doc(en_vocab, words=["Test"])
    doc1.is_parsed = True
    doc1[0].dep_ = "ROOT"
    doc2 = Doc(en_vocab, words=["Test"])
    doc2.is_tagged = True
    doc2[0].tag_ = "TAG"
    doc2[0].pos_ = "X"
    doc2[0].morph_ = "Feat=Val"
    doc2[0].lemma_ = "LEMMA"
    doc3 = Doc(en_vocab, words=["Test"])
    # DEP requires is_parsed
    # DEP requires DEP
    matcher = PhraseMatcher(en_vocab, attr="DEP")
    matcher.add("TEST1", [doc1])
    with pytest.raises(ValueError):
        matcher.add("TEST2", [doc2])
    with pytest.raises(ValueError):
        matcher.add("TEST3", [doc3])
    # TAG, POS, LEMMA require is_tagged
    # TAG, POS, LEMMA require those values
    for attr in ("TAG", "POS", "LEMMA"):
        matcher = PhraseMatcher(en_vocab, attr=attr)
        matcher.add("TEST2", [doc2])
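Both matcher test files now prepare their docs by writing token attributes directly (`dep_`, `tag_`, `pos_`, `morph_`, `lemma_`) instead of flipping the removed `is_parsed`/`is_tagged` flags: a pattern that refers to an attribute is only accepted against a doc that actually carries that annotation. A small sketch of the idea, assuming spaCy v3's `Matcher`; the tag value and pattern key are illustrative only:

```python
import pytest
from spacy.matcher import Matcher
from spacy.tokens import Doc
from spacy.vocab import Vocab

vocab = Vocab()
tagged = Doc(vocab, words=["Test"])
tagged[0].tag_ = "NN"            # this doc now carries TAG annotation
untagged = Doc(vocab, words=["Test"])

matcher = Matcher(vocab)
matcher.add("HAS_TAG", [[{"TAG": "NN"}]])
assert len(matcher(tagged)) == 1
with pytest.raises(ValueError):  # a TAG pattern against a doc with no TAG values
    matcher(untagged)
```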
@@ -2,8 +2,7 @@ import pytest
from spacy.pipeline._parser_internals.nonproj import ancestors, contains_cycle
from spacy.pipeline._parser_internals.nonproj import is_nonproj_tree, is_nonproj_arc
from spacy.pipeline._parser_internals import nonproj

from ..util import get_doc
from spacy.tokens import Doc


@pytest.fixture

@@ -74,16 +73,10 @@ def test_parser_is_nonproj_tree(
    assert is_nonproj_tree(multirooted_tree) is True


def test_parser_pseudoprojectivity(en_tokenizer):
def test_parser_pseudoprojectivity(en_vocab):
    def deprojectivize(proj_heads, deco_labels):
        tokens = en_tokenizer("whatever " * len(proj_heads))
        rel_proj_heads = [head - i for i, head in enumerate(proj_heads)]
        doc = get_doc(
            tokens.vocab,
            words=[t.text for t in tokens],
            deps=deco_labels,
            heads=rel_proj_heads,
        )
        words = ["whatever "] * len(proj_heads)
        doc = Doc(en_vocab, words=words, deps=deco_labels, heads=proj_heads)
        nonproj.deprojectivize(doc)
        return [t.head.i for t in doc], [token.dep_ for token in doc]


@@ -94,49 +87,39 @@ def test_parser_pseudoprojectivity(en_tokenizer):
    labels = ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj", "acl", "punct"]
    labels2 = ["advmod", "root", "det", "nsubj", "advmod", "det", "dobj", "det", "nmod", "aux", "nmod", "advmod", "det", "amod", "punct"]
    # fmt: on

    assert nonproj.decompose("X||Y") == ("X", "Y")
    assert nonproj.decompose("X") == ("X", "")
    assert nonproj.is_decorated("X||Y") is True
    assert nonproj.is_decorated("X") is False

    nonproj._lift(0, tree)
    assert tree == [2, 2, 2]

    assert nonproj._get_smallest_nonproj_arc(nonproj_tree) == 7
    assert nonproj._get_smallest_nonproj_arc(nonproj_tree2) == 10

    # fmt: off
    proj_heads, deco_labels = nonproj.projectivize(nonproj_tree, labels)
    assert proj_heads == [1, 2, 2, 4, 5, 2, 7, 5, 2]
    assert deco_labels == ["det", "nsubj", "root", "det", "dobj", "aux",
                           "nsubj", "acl||dobj", "punct"]

    deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
    assert deproj_heads == nonproj_tree
    assert undeco_labels == labels

    proj_heads, deco_labels = nonproj.projectivize(nonproj_tree2, labels2)
    assert proj_heads == [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1]
    assert deco_labels == ["advmod||aux", "root", "det", "nsubj", "advmod",
                           "det", "dobj", "det", "nmod", "aux", "nmod||dobj",
                           "advmod", "det", "amod", "punct"]

    deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
    assert deproj_heads == nonproj_tree2
    assert undeco_labels == labels2

    # if decoration is wrong such that there is no head with the desired label
    # the structure is kept and the label is undecorated
    proj_heads = [1, 2, 2, 4, 5, 2, 7, 5, 2]
    deco_labels = ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj",
                   "acl||iobj", "punct"]

    deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
    assert deproj_heads == proj_heads
    assert undeco_labels == ["det", "nsubj", "root", "det", "dobj", "aux",
                             "nsubj", "acl", "punct"]

    # if there are two potential new heads, the first one is chosen even if
    # it's wrong
    proj_heads = [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1]
Some files were not shown because too many files have changed in this diff.