2019-04-16 12:58:31 +03:00
|
|
|
import ctypes
|
2023-06-14 18:48:41 +03:00
|
|
|
import os
|
2018-07-25 00:38:44 +03:00
|
|
|
from pathlib import Path
|
2023-06-14 18:48:41 +03:00
|
|
|
|
|
|
|
import pytest
|
2023-08-08 12:27:28 +03:00
|
|
|
|
|
|
|
try:
|
|
|
|
from pydantic.v1 import ValidationError
|
|
|
|
except ImportError:
|
|
|
|
from pydantic import ValidationError # type: ignore
|
|
|
|
|
2023-06-14 18:48:41 +03:00
|
|
|
from thinc.api import (
|
|
|
|
Config,
|
|
|
|
ConfigValidationError,
|
|
|
|
CupyOps,
|
|
|
|
MPSOps,
|
|
|
|
NumpyOps,
|
|
|
|
Optimizer,
|
|
|
|
get_current_ops,
|
|
|
|
set_current_ops,
|
|
|
|
)
|
2022-08-30 15:21:02 +03:00
|
|
|
from thinc.compat import has_cupy_gpu, has_torch_mps_gpu
|
2023-06-14 18:48:41 +03:00
|
|
|
|
|
|
|
from spacy import prefer_gpu, require_cpu, require_gpu, util
|
|
|
|
from spacy.about import __version__ as spacy_version
|
2020-10-05 14:45:57 +03:00
|
|
|
from spacy.lang.en import English
|
|
|
|
from spacy.lang.nl import Dutch
|
|
|
|
from spacy.language import DEFAULT_CONFIG_PATH
|
2023-06-14 18:48:41 +03:00
|
|
|
from spacy.ml._precomputable_affine import (
|
|
|
|
PrecomputableAffine,
|
|
|
|
_backprop_precomputable_affine_padding,
|
|
|
|
)
|
2021-12-04 22:34:48 +03:00
|
|
|
from spacy.schemas import ConfigSchemaTraining, TokenPattern, TokenPatternSchema
|
2023-06-14 18:48:41 +03:00
|
|
|
from spacy.training.batchers import minibatch_by_words
|
|
|
|
from spacy.util import (
|
|
|
|
SimpleFrozenList,
|
|
|
|
dot_to_object,
|
|
|
|
find_available_port,
|
|
|
|
import_file,
|
|
|
|
to_ternary_int,
|
|
|
|
)
|
2020-12-08 09:42:40 +03:00
|
|
|
|
2021-03-01 19:54:14 +03:00
|
|
|
from .util import get_random_doc, make_tempdir
|
2019-04-01 13:11:27 +03:00
|
|
|
|
2019-03-15 20:14:46 +03:00
|
|
|
|
2019-04-16 12:58:31 +03:00
|
|
|
@pytest.fixture
def is_admin():
    """Report whether the test process runs with admin/root privileges."""
    try:
        # POSIX: root is uid 0.  os.getuid does not exist on Windows.
        return os.getuid() == 0
    except AttributeError:
        # Windows fallback: ask the shell API directly.
        return ctypes.windll.shell32.IsUserAnAdmin() != 0
|
|
|
|
|
|
|
|
|
2021-12-04 22:34:48 +03:00
|
|
|
@pytest.mark.issue(6207)
def test_issue6207(en_tokenizer):
    """filter_spans should drop a span that overlaps an earlier, longer one."""
    doc = en_tokenizer("zero one two three four five six")
    # Three spans: span_b overlaps span_a; span_c overlaps span_b but not span_a.
    span_a = doc[:4]
    span_b = doc[3:6]
    span_c = doc[5:7]
    filtered = util.filter_spans((span_a, span_b, span_c))
    # The overlapping middle span is filtered out; the other two survive.
    assert span_a in filtered
    assert span_b not in filtered
    assert span_c in filtered
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.issue(6258)
def test_issue6258():
    """Test that the non-empty constraint pattern field is respected"""
    # A single-element pattern list is valid...
    TokenPatternSchema(pattern=[TokenPattern()])
    # ...but an empty pattern list must fail validation, per the schema's
    # non-empty constraint.
    with pytest.raises(ValidationError):
        TokenPatternSchema(pattern=[])
|
|
|
|
|
|
|
|
|
2018-11-27 03:09:36 +03:00
|
|
|
@pytest.mark.parametrize("text", ["hello/world", "hello world"])
def test_util_ensure_path_succeeds(text):
    """ensure_path should wrap a plain string in a pathlib.Path."""
    assert isinstance(util.ensure_path(text), Path)
|
2017-05-29 02:37:57 +03:00
|
|
|
|
|
|
|
|
2020-05-24 15:55:16 +03:00
|
|
|
@pytest.mark.parametrize(
    "package,result", [("numpy", True), ("sfkodskfosdkfpsdpofkspdof", False)]
)
def test_util_is_package(package, result):
    """Test that an installed package via pip is recognised by util.is_package."""
    # A real installed package is detected; a nonsense name is not.
    installed = util.is_package(package)
    assert installed is result
|
2017-05-29 11:51:19 +03:00
|
|
|
|
|
|
|
|
2018-11-27 03:09:36 +03:00
|
|
|
@pytest.mark.parametrize("package", ["thinc"])
def test_util_get_package_path(package):
    """Test that a Path object is returned for a package name."""
    package_path = util.get_package_path(package)
    assert isinstance(package_path, Path)
|
|
|
|
|
|
|
|
|
2017-11-03 02:48:54 +03:00
|
|
|
def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
    # Sanity-check PrecomputableAffine: parameter shape, forward-pass output
    # shape, and the padding-gradient helper.  nO/nI/nF/nP are the layer's
    # output, input, feature-count and maxout-pieces sizes respectively
    # (presumably — TODO confirm against the layer's definition).
    model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP).initialize()
    assert model.get_param("W").shape == (nF, nO, nP, nI)
    tensor = model.ops.alloc((10, nI))
    Y, get_dX = model.begin_update(tensor)
    # The output has one extra row versus the input (the padding row).
    assert Y.shape == (tensor.shape[0] + 1, nF, nO, nP)
    dY = model.ops.alloc((15, nO, nP))
    ids = model.ops.alloc((15, nF))
    # Negative ids mark padding positions.  Mark one (row 1, feature 2) and
    # give that row a unit upstream gradient.
    ids[1, 2] = -1
    dY[1] = 1
    assert not model.has_grad("pad")
    d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
    # The unit gradient lands on the padding slot of feature 2.
    assert d_pad[0, 2, 0, 0] == 1.0
    # Reset both arrays and build a second scenario with multiple padding
    # positions spread across rows and features.
    ids.fill(0.0)
    dY.fill(0.0)
    dY[0] = 0
    ids[1, 2] = 0
    ids[1, 1] = -1
    ids[1, 0] = -1
    dY[1] = 1
    ids[2, 0] = -1
    dY[2] = 5
    d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
    # Feature 0 is padding in rows 1 and 2, so it accumulates 1 + 5 = 6;
    # feature 1 is padding only in row 1 (gradient 1); feature 2 gets nothing.
    assert d_pad[0, 0, 0, 0] == 6
    assert d_pad[0, 1, 0, 0] == 1
    assert d_pad[0, 2, 0, 0] == 0
|
💫 Port master changes over to develop (#2979)
* Create aryaprabhudesai.md (#2681)
* Update _install.jade (#2688)
Typo fix: "models" -> "model"
* Add FAC to spacy.explain (resolves #2706)
* Remove docstrings for deprecated arguments (see #2703)
* When calling getoption() in conftest.py, pass a default option (#2709)
* When calling getoption() in conftest.py, pass a default option
This is necessary to allow testing an installed spacy by running:
pytest --pyargs spacy
* Add contributor agreement
* update bengali token rules for hyphen and digits (#2731)
* Less norm computations in token similarity (#2730)
* Less norm computations in token similarity
* Contributor agreement
* Remove ')' for clarity (#2737)
Sorry, don't mean to be nitpicky, I just noticed this when going through the CLI and thought it was a quick fix. That said, if this was intention than please let me know.
* added contributor agreement for mbkupfer (#2738)
* Basic support for Telugu language (#2751)
* Lex _attrs for polish language (#2750)
* Signed spaCy contributor agreement
* Added polish version of english lex_attrs
* Introduces a bulk merge function, in order to solve issue #653 (#2696)
* Fix comment
* Introduce bulk merge to increase performance on many span merges
* Sign contributor agreement
* Implement pull request suggestions
* Describe converters more explicitly (see #2643)
* Add multi-threading note to Language.pipe (resolves #2582) [ci skip]
* Fix formatting
* Fix dependency scheme docs (closes #2705) [ci skip]
* Don't set stop word in example (closes #2657) [ci skip]
* Add words to portuguese language _num_words (#2759)
* Add words to portuguese language _num_words
* Add words to portuguese language _num_words
* Update Indonesian model (#2752)
* adding e-KTP in tokenizer exceptions list
* add exception token
* removing lines with containing space as it won't matter since we use .split() method in the end, added new tokens in exception
* add tokenizer exceptions list
* combining base_norms with norm_exceptions
* adding norm_exception
* fix double key in lemmatizer
* remove unused import on punctuation.py
* reformat stop_words to reduce number of lines, improve readibility
* updating tokenizer exception
* implement is_currency for lang/id
* adding orth_first_upper in tokenizer_exceptions
* update the norm_exception list
* remove bunch of abbreviations
* adding contributors file
* Fixed spaCy+Keras example (#2763)
* bug fixes in keras example
* created contributor agreement
* Adding French hyphenated first name (#2786)
* Fix typo (closes #2784)
* Fix typo (#2795) [ci skip]
Fixed typo on line 6 "regcognizer --> recognizer"
* Adding basic support for Sinhala language. (#2788)
* adding Sinhala language package, stop words, examples and lex_attrs.
* Adding contributor agreement
* Updating contributor agreement
* Also include lowercase norm exceptions
* Fix error (#2802)
* Fix error
ValueError: cannot resize an array that references or is referenced
by another array in this way. Use the resize function
* added spaCy Contributor Agreement
* Add charlax's contributor agreement (#2805)
* agreement of contributor, may I introduce a tiny pl languge contribution (#2799)
* Contributors agreement
* Contributors agreement
* Contributors agreement
* Add jupyter=True to displacy.render in documentation (#2806)
* Revert "Also include lowercase norm exceptions"
This reverts commit 70f4e8adf37cfcfab60be2b97d6deae949b30e9e.
* Remove deprecated encoding argument to msgpack
* Set up dependency tree pattern matching skeleton (#2732)
* Fix bug when too many entity types. Fixes #2800
* Fix Python 2 test failure
* Require older msgpack-numpy
* Restore encoding arg on msgpack-numpy
* Try to fix version pin for msgpack-numpy
* Update Portuguese Language (#2790)
* Add words to portuguese language _num_words
* Add words to portuguese language _num_words
* Portuguese - Add/remove stopwords, fix tokenizer, add currency symbols
* Extended punctuation and norm_exceptions in the Portuguese language
* Correct error in spacy universe docs concerning spacy-lookup (#2814)
* Update Keras Example for (Parikh et al, 2016) implementation (#2803)
* bug fixes in keras example
* created contributor agreement
* baseline for Parikh model
* initial version of parikh 2016 implemented
* tested asymmetric models
* fixed grevious error in normalization
* use standard SNLI test file
* begin to rework parikh example
* initial version of running example
* start to document the new version
* start to document the new version
* Update Decompositional Attention.ipynb
* fixed calls to similarity
* updated the README
* import sys package duh
* simplified indexing on mapping word to IDs
* stupid python indent error
* added code from https://github.com/tensorflow/tensorflow/issues/3388 for tf bug workaround
* Fix typo (closes #2815) [ci skip]
* Update regex version dependency
* Set version to 2.0.13.dev3
* Skip seemingly problematic test
* Remove problematic test
* Try previous version of regex
* Revert "Remove problematic test"
This reverts commit bdebbef45552d698d390aa430b527ee27830f11b.
* Unskip test
* Try older version of regex
* 💫 Update training examples and use minibatching (#2830)
<!--- Provide a general summary of your changes in the title. -->
## Description
Update the training examples in `/examples/training` to show usage of spaCy's `minibatch` and `compounding` helpers ([see here](https://spacy.io/usage/training#tips-batch-size) for details). The lack of batching in the examples has caused some confusion in the past, especially for beginners who would copy-paste the examples, update them with large training sets and experienced slow and unsatisfying results.
### Types of change
enhancements
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Visual C++ link updated (#2842) (closes #2841) [ci skip]
* New landing page
* Add contribution agreement
* Correcting lang/ru/examples.py (#2845)
* Correct some grammatical inaccuracies in lang\ru\examples.py; filled Contributor Agreement
* Correct some grammatical inaccuracies in lang\ru\examples.py
* Move contributor agreement to separate file
* Set version to 2.0.13.dev4
* Add Persian(Farsi) language support (#2797)
* Also include lowercase norm exceptions
* Remove in favour of https://github.com/explosion/spaCy/graphs/contributors
* Rule-based French Lemmatizer (#2818)
<!--- Provide a general summary of your changes in the title. -->
## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->
Add a rule-based French Lemmatizer following the english one and the excellent PR for [greek language optimizations](https://github.com/explosion/spaCy/pull/2558) to adapt the Lemmatizer class.
### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->
- Lemma dictionary used can be found [here](http://infolingu.univ-mlv.fr/DonneesLinguistiques/Dictionnaires/telechargement.html), I used the XML version.
- Add several files containing exhaustive list of words for each part of speech
- Add some lemma rules
- Add POS that are not checked in the standard Lemmatizer, i.e PRON, DET, ADV and AUX
- Modify the Lemmatizer class to check in lookup table as a last resort if POS not mentionned
- Modify the lemmatize function to check in lookup table as a last resort
- Init files are updated so the model can support all the functionalities mentioned above
- Add words to tokenizer_exceptions_list.py in respect to regex used in tokenizer_exceptions.py
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [X] I have submitted the spaCy Contributor Agreement.
- [X] I ran the tests, and all new and existing tests passed.
- [X] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Set version to 2.0.13
* Fix formatting and consistency
* Update docs for new version [ci skip]
* Increment version [ci skip]
* Add info on wheels [ci skip]
* Adding "This is a sentence" example to Sinhala (#2846)
* Add wheels badge
* Update badge [ci skip]
* Update README.rst [ci skip]
* Update murmurhash pin
* Increment version to 2.0.14.dev0
* Update GPU docs for v2.0.14
* Add wheel to setup_requires
* Import prefer_gpu and require_gpu functions from Thinc
* Add tests for prefer_gpu() and require_gpu()
* Update requirements and setup.py
* Workaround bug in thinc require_gpu
* Set version to v2.0.14
* Update push-tag script
* Unhack prefer_gpu
* Require thinc 6.10.6
* Update prefer_gpu and require_gpu docs [ci skip]
* Fix specifiers for GPU
* Set version to 2.0.14.dev1
* Set version to 2.0.14
* Update Thinc version pin
* Increment version
* Fix msgpack-numpy version pin
* Increment version
* Update version to 2.0.16
* Update version [ci skip]
* Redundant ')' in the Stop words' example (#2856)
<!--- Provide a general summary of your changes in the title. -->
## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->
### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [ ] I have submitted the spaCy Contributor Agreement.
- [ ] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Documentation improvement regarding joblib and SO (#2867)
Some documentation improvements
## Description
1. Fixed the dead URL to joblib
2. Fixed Stack Overflow brand name (with space)
### Types of change
Documentation
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
* raise error when setting overlapping entities as doc.ents (#2880)
* Fix out-of-bounds access in NER training
The helper method state.B(1) gets the index of the first token of the
buffer, or -1 if no such token exists. Normally this is safe because we
pass this to functions like state.safe_get(), which returns an empty
token. Here we used it directly as an array index, which is not okay!
This error may have been the cause of out-of-bounds access errors during
training. Similar errors may still be around, so much be hunted down.
Hunting this one down took a long time...I printed out values across
training runs and diffed, looking for points of divergence between
runs, when no randomness should be allowed.
* Change PyThaiNLP Url (#2876)
* Fix missing comma
* Add example showing a fix-up rule for space entities
* Set version to 2.0.17.dev0
* Update regex version
* Revert "Update regex version"
This reverts commit 62358dd867d15bc6a475942dff34effba69dd70a.
* Try setting older regex version, to align with conda
* Set version to 2.0.17
* Add spacy-js to universe [ci-skip]
* Add spacy-raspberry to universe (closes #2889)
* Add script to validate universe json [ci skip]
* Removed space in docs + added contributor indo (#2909)
* - removed unneeded space in documentation
* - added contributor info
* Allow input text of length up to max_length, inclusive (#2922)
* Include universe spec for spacy-wordnet component (#2919)
* feat: include universe spec for spacy-wordnet component
* chore: include spaCy contributor agreement
* Minor formatting changes [ci skip]
* Fix image [ci skip]
Twitter URL doesn't work on live site
* Check if the word is in one of the regular lists specific to each POS (#2886)
* 💫 Create random IDs for SVGs to prevent ID clashes (#2927)
Resolves #2924.
## Description
Fixes problem where multiple visualizations in Jupyter notebooks would have clashing arc IDs, resulting in weirdly positioned arc labels. Generating a random ID prefix so even identical parses won't receive the same IDs for consistency (even if effect of ID clash isn't noticable here.)
### Types of change
bug fix
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Fix typo [ci skip]
* fixes symbolic link on py3 and windows (#2949)
* fixes symbolic link on py3 and windows
during setup of spacy using command
python -m spacy link en_core_web_sm en
closes #2948
* Update spacy/compat.py
Co-Authored-By: cicorias <cicorias@users.noreply.github.com>
* Fix formatting
* Update universe [ci skip]
* Catalan Language Support (#2940)
* Catalan language Support
* Ddding Catalan to documentation
* Sort languages alphabetically [ci skip]
* Update tests for pytest 4.x (#2965)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Replace marks in params for pytest 4.0 compat ([see here](https://docs.pytest.org/en/latest/deprecations.html#marks-in-pytest-mark-parametrize))
- [x] Un-xfail passing tests (some fixes in a recent update resolved a bunch of issues, but tests were apparently never updated here)
### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Fix regex pin to harmonize with conda (#2964)
* Update README.rst
* Fix bug where Vocab.prune_vector did not use 'batch_size' (#2977)
Fixes #2976
* Fix typo
* Fix typo
* Remove duplicate file
* Require thinc 7.0.0.dev2
Fixes bug in gpu_ops that would use cupy instead of numpy on CPU
* Add missing import
* Fix error IDs
* Fix tests
2018-11-29 18:30:29 +03:00
|
|
|
|
|
|
|
|
|
|
|
def test_prefer_gpu():
    """prefer_gpu should activate the available GPU ops backend, if any."""
    # Remember the active backend so it can be restored after the test.
    previous_ops = get_current_ops()
    if has_cupy_gpu:
        # CUDA device present: prefer_gpu succeeds and switches to CupyOps.
        assert prefer_gpu()
        assert isinstance(get_current_ops(), CupyOps)
    elif has_torch_mps_gpu:
        # Apple-silicon GPU present: prefer_gpu succeeds with MPSOps.
        assert prefer_gpu()
        assert isinstance(get_current_ops(), MPSOps)
    else:
        # No GPU available: prefer_gpu reports failure.
        assert not prefer_gpu()
    set_current_ops(previous_ops)
|
💫 Port master changes over to develop (#2979)
* Create aryaprabhudesai.md (#2681)
* Update _install.jade (#2688)
Typo fix: "models" -> "model"
* Add FAC to spacy.explain (resolves #2706)
* Remove docstrings for deprecated arguments (see #2703)
* When calling getoption() in conftest.py, pass a default option (#2709)
* When calling getoption() in conftest.py, pass a default option
This is necessary to allow testing an installed spacy by running:
pytest --pyargs spacy
* Add contributor agreement
* update bengali token rules for hyphen and digits (#2731)
* Less norm computations in token similarity (#2730)
* Less norm computations in token similarity
* Contributor agreement
* Remove ')' for clarity (#2737)
Sorry, don't mean to be nitpicky, I just noticed this when going through the CLI and thought it was a quick fix. That said, if this was intention than please let me know.
* added contributor agreement for mbkupfer (#2738)
* Basic support for Telugu language (#2751)
* Lex _attrs for polish language (#2750)
* Signed spaCy contributor agreement
* Added polish version of english lex_attrs
* Introduces a bulk merge function, in order to solve issue #653 (#2696)
* Fix comment
* Introduce bulk merge to increase performance on many span merges
* Sign contributor agreement
* Implement pull request suggestions
* Describe converters more explicitly (see #2643)
* Add multi-threading note to Language.pipe (resolves #2582) [ci skip]
* Fix formatting
* Fix dependency scheme docs (closes #2705) [ci skip]
* Don't set stop word in example (closes #2657) [ci skip]
* Add words to portuguese language _num_words (#2759)
* Add words to portuguese language _num_words
* Add words to portuguese language _num_words
* Update Indonesian model (#2752)
* adding e-KTP in tokenizer exceptions list
* add exception token
* removing lines with containing space as it won't matter since we use .split() method in the end, added new tokens in exception
* add tokenizer exceptions list
* combining base_norms with norm_exceptions
* adding norm_exception
* fix double key in lemmatizer
* remove unused import on punctuation.py
* reformat stop_words to reduce number of lines, improve readibility
* updating tokenizer exception
* implement is_currency for lang/id
* adding orth_first_upper in tokenizer_exceptions
* update the norm_exception list
* remove bunch of abbreviations
* adding contributors file
* Fixed spaCy+Keras example (#2763)
* bug fixes in keras example
* created contributor agreement
* Adding French hyphenated first name (#2786)
* Fix typo (closes #2784)
* Fix typo (#2795) [ci skip]
Fixed typo on line 6 "regcognizer --> recognizer"
* Adding basic support for Sinhala language. (#2788)
* adding Sinhala language package, stop words, examples and lex_attrs.
* Adding contributor agreement
* Updating contributor agreement
* Also include lowercase norm exceptions
* Fix error (#2802)
* Fix error
ValueError: cannot resize an array that references or is referenced
by another array in this way. Use the resize function
* added spaCy Contributor Agreement
* Add charlax's contributor agreement (#2805)
* agreement of contributor, may I introduce a tiny pl languge contribution (#2799)
* Contributors agreement
* Contributors agreement
* Contributors agreement
* Add jupyter=True to displacy.render in documentation (#2806)
* Revert "Also include lowercase norm exceptions"
This reverts commit 70f4e8adf37cfcfab60be2b97d6deae949b30e9e.
* Remove deprecated encoding argument to msgpack
* Set up dependency tree pattern matching skeleton (#2732)
* Fix bug when too many entity types. Fixes #2800
* Fix Python 2 test failure
* Require older msgpack-numpy
* Restore encoding arg on msgpack-numpy
* Try to fix version pin for msgpack-numpy
* Update Portuguese Language (#2790)
* Add words to portuguese language _num_words
* Add words to portuguese language _num_words
* Portuguese - Add/remove stopwords, fix tokenizer, add currency symbols
* Extended punctuation and norm_exceptions in the Portuguese language
* Correct error in spacy universe docs concerning spacy-lookup (#2814)
* Update Keras Example for (Parikh et al, 2016) implementation (#2803)
* bug fixes in keras example
* created contributor agreement
* baseline for Parikh model
* initial version of parikh 2016 implemented
* tested asymmetric models
* fixed grevious error in normalization
* use standard SNLI test file
* begin to rework parikh example
* initial version of running example
* start to document the new version
* start to document the new version
* Update Decompositional Attention.ipynb
* fixed calls to similarity
* updated the README
* import sys package duh
* simplified indexing on mapping word to IDs
* stupid python indent error
* added code from https://github.com/tensorflow/tensorflow/issues/3388 for tf bug workaround
* Fix typo (closes #2815) [ci skip]
* Update regex version dependency
* Set version to 2.0.13.dev3
* Skip seemingly problematic test
* Remove problematic test
* Try previous version of regex
* Revert "Remove problematic test"
This reverts commit bdebbef45552d698d390aa430b527ee27830f11b.
* Unskip test
* Try older version of regex
* 💫 Update training examples and use minibatching (#2830)
<!--- Provide a general summary of your changes in the title. -->
## Description
Update the training examples in `/examples/training` to show usage of spaCy's `minibatch` and `compounding` helpers ([see here](https://spacy.io/usage/training#tips-batch-size) for details). The lack of batching in the examples has caused some confusion in the past, especially for beginners who would copy-paste the examples, update them with large training sets and experienced slow and unsatisfying results.
### Types of change
enhancements
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Visual C++ link updated (#2842) (closes #2841) [ci skip]
* New landing page
* Add contribution agreement
* Correcting lang/ru/examples.py (#2845)
* Correct some grammatical inaccuracies in lang\ru\examples.py; filled Contributor Agreement
* Correct some grammatical inaccuracies in lang\ru\examples.py
* Move contributor agreement to separate file
* Set version to 2.0.13.dev4
* Add Persian(Farsi) language support (#2797)
* Also include lowercase norm exceptions
* Remove in favour of https://github.com/explosion/spaCy/graphs/contributors
* Rule-based French Lemmatizer (#2818)
<!--- Provide a general summary of your changes in the title. -->
## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->
Add a rule-based French Lemmatizer following the english one and the excellent PR for [greek language optimizations](https://github.com/explosion/spaCy/pull/2558) to adapt the Lemmatizer class.
### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->
- Lemma dictionary used can be found [here](http://infolingu.univ-mlv.fr/DonneesLinguistiques/Dictionnaires/telechargement.html), I used the XML version.
- Add several files containing exhaustive list of words for each part of speech
- Add some lemma rules
- Add POS that are not checked in the standard Lemmatizer, i.e PRON, DET, ADV and AUX
- Modify the Lemmatizer class to check in lookup table as a last resort if POS not mentionned
- Modify the lemmatize function to check in lookup table as a last resort
- Init files are updated so the model can support all the functionalities mentioned above
- Add words to tokenizer_exceptions_list.py in respect to regex used in tokenizer_exceptions.py
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [X] I have submitted the spaCy Contributor Agreement.
- [X] I ran the tests, and all new and existing tests passed.
- [X] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Set version to 2.0.13
* Fix formatting and consistency
* Update docs for new version [ci skip]
* Increment version [ci skip]
* Add info on wheels [ci skip]
* Adding "This is a sentence" example to Sinhala (#2846)
* Add wheels badge
* Update badge [ci skip]
* Update README.rst [ci skip]
* Update murmurhash pin
* Increment version to 2.0.14.dev0
* Update GPU docs for v2.0.14
* Add wheel to setup_requires
* Import prefer_gpu and require_gpu functions from Thinc
* Add tests for prefer_gpu() and require_gpu()
* Update requirements and setup.py
* Workaround bug in thinc require_gpu
* Set version to v2.0.14
* Update push-tag script
* Unhack prefer_gpu
* Require thinc 6.10.6
* Update prefer_gpu and require_gpu docs [ci skip]
* Fix specifiers for GPU
* Set version to 2.0.14.dev1
* Set version to 2.0.14
* Update Thinc version pin
* Increment version
* Fix msgpack-numpy version pin
* Increment version
* Update version to 2.0.16
* Update version [ci skip]
* Redundant ')' in the Stop words' example (#2856)
<!--- Provide a general summary of your changes in the title. -->
## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->
### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [ ] I have submitted the spaCy Contributor Agreement.
- [ ] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Documentation improvement regarding joblib and SO (#2867)
Some documentation improvements
## Description
1. Fixed the dead URL to joblib
2. Fixed Stack Overflow brand name (with space)
### Types of change
Documentation
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
* raise error when setting overlapping entities as doc.ents (#2880)
* Fix out-of-bounds access in NER training
The helper method state.B(1) gets the index of the first token of the
buffer, or -1 if no such token exists. Normally this is safe because we
pass this to functions like state.safe_get(), which returns an empty
token. Here we used it directly as an array index, which is not okay!
This error may have been the cause of out-of-bounds access errors during
training. Similar errors may still be around, so must be hunted down.
Hunting this one down took a long time...I printed out values across
training runs and diffed, looking for points of divergence between
runs, when no randomness should be allowed.
* Change PyThaiNLP Url (#2876)
* Fix missing comma
* Add example showing a fix-up rule for space entities
* Set version to 2.0.17.dev0
* Update regex version
* Revert "Update regex version"
This reverts commit 62358dd867d15bc6a475942dff34effba69dd70a.
* Try setting older regex version, to align with conda
* Set version to 2.0.17
* Add spacy-js to universe [ci-skip]
* Add spacy-raspberry to universe (closes #2889)
* Add script to validate universe json [ci skip]
* Removed space in docs + added contributor indo (#2909)
* - removed unneeded space in documentation
* - added contributor info
* Allow input text of length up to max_length, inclusive (#2922)
* Include universe spec for spacy-wordnet component (#2919)
* feat: include universe spec for spacy-wordnet component
* chore: include spaCy contributor agreement
* Minor formatting changes [ci skip]
* Fix image [ci skip]
Twitter URL doesn't work on live site
* Check if the word is in one of the regular lists specific to each POS (#2886)
* 💫 Create random IDs for SVGs to prevent ID clashes (#2927)
Resolves #2924.
## Description
Fixes problem where multiple visualizations in Jupyter notebooks would have clashing arc IDs, resulting in weirdly positioned arc labels. Generating a random ID prefix so even identical parses won't receive the same IDs for consistency (even if effect of ID clash isn't noticable here.)
### Types of change
bug fix
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Fix typo [ci skip]
* fixes symbolic link on py3 and windows (#2949)
* fixes symbolic link on py3 and windows
during setup of spacy using command
python -m spacy link en_core_web_sm en
closes #2948
* Update spacy/compat.py
Co-Authored-By: cicorias <cicorias@users.noreply.github.com>
* Fix formatting
* Update universe [ci skip]
* Catalan Language Support (#2940)
* Catalan language Support
* Adding Catalan to documentation
* Sort languages alphabetically [ci skip]
* Update tests for pytest 4.x (#2965)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Replace marks in params for pytest 4.0 compat ([see here](https://docs.pytest.org/en/latest/deprecations.html#marks-in-pytest-mark-parametrize))
- [x] Un-xfail passing tests (some fixes in a recent update resolved a bunch of issues, but tests were apparently never updated here)
### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Fix regex pin to harmonize with conda (#2964)
* Update README.rst
* Fix bug where Vocab.prune_vector did not use 'batch_size' (#2977)
Fixes #2976
* Fix typo
* Fix typo
* Remove duplicate file
* Require thinc 7.0.0.dev2
Fixes bug in gpu_ops that would use cupy instead of numpy on CPU
* Add missing import
* Fix error IDs
* Fix tests
2018-11-29 18:30:29 +03:00
|
|
|
|
|
|
|
|
|
|
|
def test_require_gpu():
    """require_gpu() should activate the available GPU backend (CuPy or MPS)."""
    saved_ops = get_current_ops()
    # Pick the ops class we expect require_gpu() to install, if any GPU exists.
    expected_ops = None
    if has_cupy_gpu:
        expected_ops = CupyOps
    elif has_torch_mps_gpu:
        expected_ops = MPSOps
    if expected_ops is not None:
        require_gpu()
        assert isinstance(get_current_ops(), expected_ops)
    # Restore whatever backend was active before the test ran.
    set_current_ops(saved_ops)
|
2019-03-15 20:14:46 +03:00
|
|
|
|
2021-01-05 05:41:53 +03:00
|
|
|
|
2020-12-08 09:42:40 +03:00
|
|
|
def test_require_cpu():
    """require_cpu() must always switch back to NumpyOps, even after require_gpu()."""
    saved_ops = get_current_ops()
    require_cpu()
    assert isinstance(get_current_ops(), NumpyOps)
    try:
        # Only exercise the GPU round-trip when cupy is importable.
        import cupy  # noqa: F401

        require_gpu()
        assert isinstance(get_current_ops(), CupyOps)
    except ImportError:
        pass
    # Switching back to CPU must work regardless of the GPU detour above.
    require_cpu()
    assert isinstance(get_current_ops(), NumpyOps)
    # Restore the backend that was active before the test.
    set_current_ops(saved_ops)
|
2020-12-08 09:42:40 +03:00
|
|
|
|
2019-03-15 20:14:46 +03:00
|
|
|
|
2019-09-30 19:45:30 +03:00
|
|
|
def test_ascii_filenames():
    """Test that all filenames in the project are ASCII.

    Uses str.isascii() (Python 3.7+) instead of a manual ord() scan.
    See: https://twitter.com/_inesmontani/status/1177941471632211968
    """
    root = Path(__file__).parent.parent
    for path in root.glob("**/*"):
        # The failing filename is included in the assertion message.
        assert path.name.isascii(), path.name
|
2020-05-21 21:24:07 +03:00
|
|
|
|
|
|
|
|
|
|
|
def test_load_model_blank_shortcut():
    """A "blank:<lang>" model name behaves like spacy.blank("<lang>")."""
    nlp = util.load_model("blank:en")
    assert nlp.lang == "en"
    assert nlp.pipeline == []

    # Both an unsupported language ("zxx") and an unregistered, invalid
    # language code must raise ImportError.
    for bad_name in ("blank:zxx", "blank:fjsfijsdof"):
        with pytest.raises(ImportError):
            util.load_model(bad_name)
|
2020-06-02 18:23:16 +03:00
|
|
|
|
|
|
|
|
2020-05-22 17:55:15 +03:00
|
|
|
@pytest.mark.parametrize(
    # version: installed version; constraint: requirement specifier;
    # compatible: expected result — None when either side is unparseable.
    "version,constraint,compatible",
    [
        (spacy_version, spacy_version, True),
        (spacy_version, f">={spacy_version}", True),
        ("3.0.0", "2.0.0", False),
        ("3.2.1", ">=2.0.0", True),
        ("2.2.10a1", ">=1.0.0,<2.1.1", False),
        ("3.0.0.dev3", ">=1.2.3,<4.5.6", True),
        # "n/a" is not a valid version or specifier, so the check returns None.
        ("n/a", ">=1.2.3,<4.5.6", None),
        ("1.2.3", "n/a", None),
        ("n/a", "n/a", None),
    ],
)
def test_is_compatible_version(version, constraint, compatible):
    """Check is_compatible_version; `is` also pins the None (unparseable) case."""
    assert util.is_compatible_version(version, constraint) is compatible
|
2020-06-05 13:42:15 +03:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    # constraint: version specifier; expected: whether util considers it
    # "unconstrained" (None for an unparseable specifier).
    "constraint,expected",
    [
        ("3.0.0", False),
        ("==3.0.0", False),
        (">=2.3.0", True),
        (">2.0.0", True),
        ("<=2.0.0", True),
        (">2.0.0,<3.0.0", False),
        (">=2.0.0,<3.0.0", False),
        ("!=1.1,>=1.0,~=1.0", True),
        # Invalid specifier -> None rather than a boolean.
        ("n/a", None),
    ],
)
def test_is_unconstrained_version(constraint, expected):
    """Check util.is_unconstrained_version against representative specifiers."""
    assert util.is_unconstrained_version(constraint) is expected
|
2020-07-22 14:42:59 +03:00
|
|
|
|
|
|
|
|
2020-10-05 14:53:07 +03:00
|
|
|
@pytest.mark.parametrize(
    # a1/b1: full versions; a2/b2: their expected major.minor truncations;
    # is_match: whether a and b share the same minor version.
    "a1,a2,b1,b2,is_match",
    [
        ("3.0.0", "3.0", "3.0.1", "3.0", True),
        ("3.1.0", "3.1", "3.2.1", "3.2", False),
        # Unparseable version -> None, and never matches anything.
        ("xxx", None, "1.2.3.dev0", "1.2", False),
    ],
)
def test_minor_version(a1, a2, b1, b2, is_match):
    """get_minor_version truncates to major.minor; is_minor_version_match compares."""
    assert util.get_minor_version(a1) == a2
    assert util.get_minor_version(b1) == b2
    assert util.is_minor_version_match(a1, b1) is is_match
    # The match result is the same whether full or truncated versions are given.
    assert util.is_minor_version_match(a2, b2) is is_match
|
|
|
|
|
|
|
|
|
2020-07-22 14:42:59 +03:00
|
|
|
@pytest.mark.parametrize(
    # dot_notation: flat dict with dotted keys; expected: the nested form.
    "dot_notation,expected",
    [
        (
            {"token.pos": True, "token._.xyz": True},
            {"token": {"pos": True, "_": {"xyz": True}}},
        ),
        (
            {"training.batch_size": 128, "training.optimizer.learn_rate": 0.01},
            {"training": {"batch_size": 128, "optimizer": {"learn_rate": 0.01}}},
        ),
        (
            {"attribute_ruler.scorer.@scorers": "spacy.tagger_scorer.v1"},
            {"attribute_ruler": {"scorer": {"@scorers": "spacy.tagger_scorer.v1"}}},
        ),
    ],
)
def test_dot_to_dict(dot_notation, expected):
    """dot_to_dict and dict_to_dot must be exact inverses of each other."""
    result = util.dot_to_dict(dot_notation)
    assert result == expected
    assert util.dict_to_dot(result) == dot_notation
|
2020-10-05 14:45:57 +03:00
|
|
|
|
|
|
|
|
2023-06-27 18:36:33 +03:00
|
|
|
@pytest.mark.parametrize(
    # dot_notation: flat dict with dotted keys; expected: the nested form.
    "dot_notation,expected",
    [
        (
            {"token.pos": True, "token._.xyz": True},
            {"token": {"pos": True, "_": {"xyz": True}}},
        ),
        (
            {"training.batch_size": 128, "training.optimizer.learn_rate": 0.01},
            {"training": {"batch_size": 128, "optimizer": {"learn_rate": 0.01}}},
        ),
        (
            # Per this case, with for_overrides=True a dict value is kept whole
            # rather than flattened into deeper dotted keys.
            {"attribute_ruler.scorer": {"@scorers": "spacy.tagger_scorer.v1"}},
            {"attribute_ruler": {"scorer": {"@scorers": "spacy.tagger_scorer.v1"}}},
        ),
    ],
)
def test_dot_to_dict_overrides(dot_notation, expected):
    """Round-trip dot notation through dict_to_dot with for_overrides=True."""
    result = util.dot_to_dict(dot_notation)
    assert result == expected
    assert util.dict_to_dot(result, for_overrides=True) == dot_notation
|
|
|
|
|
|
|
|
|
2021-01-29 07:57:04 +03:00
|
|
|
def test_set_dot_to_object():
    """set_dot_to_object writes into nested dicts via dot paths, rejecting bad paths.

    NOTE: the steps below mutate `config` in sequence, so their order matters.
    """
    config = {"foo": {"bar": 1, "baz": {"x": "y"}}, "test": {"a": {"b": "c"}}}
    # Cannot descend through a non-dict leaf ("foo.bar" is an int).
    with pytest.raises(KeyError):
        util.set_dot_to_object(config, "foo.bar.baz", 100)
    # Cannot create a path whose first segment does not exist.
    with pytest.raises(KeyError):
        util.set_dot_to_object(config, "hello.world", 100)
    # Cannot descend past a string leaf ("test.a.b" is "c").
    with pytest.raises(KeyError):
        util.set_dot_to_object(config, "test.a.b.c", 100)
    util.set_dot_to_object(config, "foo.bar", 100)
    assert config["foo"]["bar"] == 100
    # A dict value can replace a scalar leaf.
    util.set_dot_to_object(config, "foo.baz.x", {"hello": "world"})
    assert config["foo"]["baz"]["x"]["hello"] == "world"
    # Untouched sections must remain intact.
    assert config["test"]["a"]["b"] == "c"
    # Whole top-level sections can be replaced by arbitrary values.
    util.set_dot_to_object(config, "foo", 123)
    assert config["foo"] == 123
    util.set_dot_to_object(config, "test", "hello")
    assert dict(config) == {"foo": 123, "test": "hello"}
|
|
|
|
|
|
|
|
|
2020-10-05 14:45:57 +03:00
|
|
|
@pytest.mark.parametrize(
    # doc_sizes: word counts of the input docs; expected_batches: the number
    # of docs expected in each produced batch.
    "doc_sizes, expected_batches",
    [
        ([400, 400, 199], [3]),
        ([400, 400, 199, 3], [4]),
        ([400, 400, 199, 3, 200], [3, 2]),
        ([400, 400, 199, 3, 1], [5]),
        ([400, 400, 199, 3, 1, 1500], [5]),  # 1500 will be discarded
        ([400, 400, 199, 3, 1, 200], [3, 3]),
        ([400, 400, 199, 3, 1, 999], [3, 3]),
        ([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]),
        ([1, 2, 999], [3]),
        ([1, 2, 999, 1], [4]),
        ([1, 200, 999, 1], [2, 2]),
        ([1, 999, 200, 1], [2, 2]),
    ],
)
def test_util_minibatch(doc_sizes, expected_batches):
    """Batches respect the word budget and oversize docs are discarded."""
    docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
    tol = 0.2
    batch_size = 1000
    batches = list(
        minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=True)
    )
    assert [len(batch) for batch in batches] == expected_batches

    # Every batch must stay within the tolerance-adjusted word budget.
    # (Fix: pass a generator to sum() instead of building a throwaway list.)
    max_size = batch_size + batch_size * tol
    for batch in batches:
        assert sum(len(doc) for doc in batch) < max_size
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    # doc_sizes: word counts of the input docs; expected_batches: the number
    # of docs expected in each produced batch.
    "doc_sizes, expected_batches",
    [
        ([400, 4000, 199], [1, 2]),
        ([400, 400, 199, 3000, 200], [1, 4]),
        ([400, 400, 199, 3, 1, 1500], [1, 5]),
        ([400, 400, 199, 3000, 2000, 200, 200], [1, 1, 3, 2]),
        ([1, 2, 9999], [1, 2]),
        ([2000, 1, 2000, 1, 1, 1, 2000], [1, 1, 1, 4]),
    ],
)
def test_util_minibatch_oversize(doc_sizes, expected_batches):
    """Test that oversized documents are returned in their own batch"""
    documents = [get_random_doc(n_words) for n_words in doc_sizes]
    batch_iter = minibatch_by_words(
        documents, size=1000, tolerance=0.2, discard_oversize=False
    )
    assert [len(batch) for batch in batch_iter] == expected_batches
|
|
|
|
|
|
|
|
|
|
|
|
def test_util_dot_section():
    """Build pipelines from config and check dot_to_object on the results."""
    # Minimal English pipeline config with a single textcat component.
    cfg_string = """
    [nlp]
    lang = "en"
    pipeline = ["textcat"]

    [components]

    [components.textcat]
    factory = "textcat"

    [components.textcat.model]
    @architectures = "spacy.TextCatBOW.v3"
    exclusive_classes = true
    length = 262144
    ngram_size = 1
    no_output_layer = false
    """
    nlp_config = Config().from_str(cfg_string)
    en_nlp = util.load_model_from_config(nlp_config, auto_fill=True)
    # Second pipeline built purely from the auto-filled default config.
    default_config = Config().from_disk(DEFAULT_CONFIG_PATH)
    default_config["nlp"]["lang"] = "nl"
    nl_nlp = util.load_model_from_config(default_config, auto_fill=True)
    # Test that creation went OK
    assert isinstance(en_nlp, English)
    assert isinstance(nl_nlp, Dutch)
    assert nl_nlp.pipe_names == []
    assert en_nlp.pipe_names == ["textcat"]
    # not exclusive_classes
    assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
    # Test that default values got overwritten
    assert en_nlp.config["nlp"]["pipeline"] == ["textcat"]
    assert nl_nlp.config["nlp"]["pipeline"] == []  # default value []
    # Test proper functioning of 'dot_to_object'
    with pytest.raises(KeyError):
        dot_to_object(en_nlp.config, "nlp.pipeline.tagger")
    with pytest.raises(KeyError):
        dot_to_object(en_nlp.config, "nlp.unknownattribute")
    # Resolved training section should expose a real Optimizer instance.
    T = util.registry.resolve(nl_nlp.config["training"], schema=ConfigSchemaTraining)
    assert isinstance(dot_to_object({"training": T}, "training.optimizer"), Optimizer)
|
|
|
|
|
|
|
|
|
|
|
|
def test_simple_frozen_list():
    """SimpleFrozenList reads like a list but rejects every mutating method."""
    frozen = SimpleFrozenList(["foo", "bar"])
    assert frozen == ["foo", "bar"]
    assert frozen.index("bar") == 1  # okay method
    # Every mutating operation must raise NotImplementedError.
    for mutate in (
        lambda: frozen.append("baz"),
        lambda: frozen.sort(),
        lambda: frozen.extend(["baz"]),
        lambda: frozen.pop(),
    ):
        with pytest.raises(NotImplementedError):
            mutate()
    # A custom error message doesn't change the raised exception type.
    frozen = SimpleFrozenList(["foo", "bar"], error="Error!")
    with pytest.raises(NotImplementedError):
        frozen.append("baz")
|
|
|
|
|
|
|
|
|
|
|
|
def test_resolve_dot_names():
    """Dot names resolve to registered objects; bad paths fail with a precise loc."""
    config = {
        "training": {"optimizer": {"@optimizers": "Adam.v1"}},
        "foo": {"bar": "training.optimizer", "baz": "training.xyz"},
    }
    # A valid dot name resolves to the registered Optimizer.
    (optimizer,) = util.resolve_dot_names(config, ["training.optimizer"])
    assert isinstance(optimizer, Optimizer)
    # An unknown path raises, reporting exactly one error at the bad location.
    with pytest.raises(ConfigValidationError) as excinfo:
        util.resolve_dot_names(config, ["training.xyz", "training.optimizer"])
    errors = excinfo.value.errors
    assert len(errors) == 1
    assert errors[0]["loc"] == ["training", "xyz"]
|
2021-03-01 18:32:31 +03:00
|
|
|
|
|
|
|
|
|
|
|
def test_import_code():
    """Write a component factory to a temp file, import it, and use it in a pipeline."""
    # Source for a dummy pipeline component; written verbatim to disk below.
    code_str = """
from spacy import Language

class DummyComponent:
    def __init__(self, vocab, name):
        pass

    def initialize(self, get_examples, *, nlp, dummy_param: int):
        pass

@Language.factory(
    "dummy_component",
)
def make_dummy_component(
    nlp: Language, name: str
):
    return DummyComponent(nlp.vocab, name)
"""

    with make_tempdir() as temp_dir:
        code_path = os.path.join(temp_dir, "code.py")
        with open(code_path, "w") as fileh:
            fileh.write(code_str)

        # Importing registers the "dummy_component" factory with Language.
        import_file("python_code", code_path)
    config = {"initialize": {"components": {"dummy_component": {"dummy_param": 1}}}}
    nlp = English.from_config(config)
    nlp.add_pipe("dummy_component")
    # initialize() passes dummy_param from the config through to the component.
    nlp.initialize()
|
2021-04-29 17:58:54 +03:00
|
|
|
|
|
|
|
|
|
|
|
def test_to_ternary_int():
    """to_ternary_int maps values onto {-1, 0, 1}; unrecognized inputs give -1."""
    cases = [
        (True, 1),
        (None, 0),
        (False, -1),
        (1, 1),
        (1.0, 1),
        (0, 0),
        (0.0, 0),
        (-1, -1),
        (5, -1),
        (-10, -1),
        ("string", -1),
        ([0, "string"], -1),
    ]
    for value, expected in cases:
        assert to_ternary_int(value) == expected
|
2023-01-10 09:52:57 +03:00
|
|
|
|
|
|
|
|
|
|
|
def test_find_available_port():
    """find_available_port returns the requested port when free, else the next one."""
    from wsgiref.simple_server import demo_app, make_server

    host = "0.0.0.0"
    port = 5000
    assert find_available_port(port, host) == port, "Port 5000 isn't free"

    # Occupy the port, then ask again with auto_select: the next port is
    # returned and a warning about the busy port is emitted.
    with make_server(host, port, demo_app) as httpd:
        with pytest.warns(UserWarning, match="already in use"):
            next_port = find_available_port(port, host, auto_select=True)
        assert next_port == port + 1, "Didn't find next port"
|