Mirror of https://github.com/explosion/spaCy.git, synced 2025-01-12 18:26:30 +03:00

Merge pull request #11958 from adrianeboyd/backport/v3.3.2

Backport bug fixes to v3.3.x

This commit is contained in: commit 4e032da3b9

.github/azure-steps.yml (vendored): 70 changed lines
@@ -1,9 +1,6 @@
 parameters:
   python_version: ''
-  architecture: ''
-  prefix: ''
-  gpu: false
-  num_build_jobs: 1
+  architecture: 'x64'

 steps:
 - task: UsePythonVersion@0
@@ -16,16 +13,16 @@ steps:
   displayName: 'Set variables'

 - script: |
-    ${{ parameters.prefix }} python -m pip install -U pip setuptools
-    ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
+    python -m pip install -U build pip setuptools
+    python -m pip install -U -r requirements.txt
   displayName: "Install dependencies"

 - script: |
-    ${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }}
-    ${{ parameters.prefix }} python setup.py sdist --formats=gztar
-  displayName: "Compile and build sdist"
+    python -m build --sdist
+  displayName: "Build sdist"

-- script: python -m mypy spacy
+- script: |
+    python -m mypy spacy
   displayName: 'Run mypy'
   condition: ne(variables['python_version'], '3.10')
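Note: the switch from "setup.py sdist" to "python -m build --sdist" above moves the CI to the PEP 517 build front end, which builds the sdist in an isolated environment from the project's declared build requirements. A minimal, hedged sketch of invoking the same front end from Python (it assumes the "build" package is installed, as the updated pip install step arranges):

    # Sketch only: run the PEP 517 front end programmatically, the same thing
    # the CI step does with "python -m build --sdist".
    import subprocess
    import sys

    subprocess.run([sys.executable, "-m", "build", "--sdist"], check=True)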
@@ -34,35 +31,24 @@ steps:
     contents: "spacy"
   displayName: "Delete source directory"

+- task: DeleteFiles@1
+  inputs:
+    contents: "*.egg-info"
+  displayName: "Delete egg-info directory"
+
 - script: |
-    ${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt
-    ${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt
+    python -m pip freeze > installed.txt
+    python -m pip uninstall -y -r installed.txt
   displayName: "Uninstall all packages"

 - bash: |
-    ${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
-    ${{ parameters.prefix }} python -m pip install dist/$SDIST
+    SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
+    python -m pip install dist/$SDIST
   displayName: "Install from sdist"

 - script: |
-    ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
-  displayName: "Install test requirements"
-
-- script: |
-    ${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0
-    ${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
-  displayName: "Install GPU requirements"
-  condition: eq(${{ parameters.gpu }}, true)
-
-- script: |
-    ${{ parameters.prefix }} python -m pytest --pyargs spacy
-  displayName: "Run CPU tests"
-  condition: eq(${{ parameters.gpu }}, false)
-
-- script: |
-    ${{ parameters.prefix }} python -m pytest --pyargs spacy -p spacy.tests.enable_gpu
-  displayName: "Run GPU tests"
-  condition: eq(${{ parameters.gpu }}, true)
+    python -W error -c "import spacy"
+  displayName: "Test import"

 - script: |
     python -m spacy download ca_core_news_sm
@@ -105,13 +91,21 @@ steps:
   displayName: 'Test assemble CLI vectors warning'
   condition: eq(variables['python_version'], '3.8')

+- script: |
+    python -m pip install -U -r requirements.txt
+  displayName: "Install test requirements"
+
+- script: |
+    python -m pytest --pyargs spacy -W error
+  displayName: "Run CPU tests"
+
+- script: |
+    python -m pip install 'spacy[apple]'
+    python -m pytest --pyargs spacy
+  displayName: "Run CPU tests with thinc-apple-ops"
+  condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.10'))
+
 - script: |
     python .github/validate_universe_json.py website/meta/universe.json
   displayName: 'Test website/meta/universe.json'
   condition: eq(variables['python_version'], '3.8')
-
-- script: |
-    ${{ parameters.prefix }} python -m pip install thinc-apple-ops
-    ${{ parameters.prefix }} python -m pytest --pyargs spacy
-  displayName: "Run CPU tests with thinc-apple-ops"
-  condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.9'))
@@ -6,7 +6,7 @@ repos:
     language_version: python3.7
     additional_dependencies: ['click==8.0.4']
 - repo: https://gitlab.com/pycqa/flake8
-  rev: 3.9.2
+  rev: 5.0.4
   hooks:
   - id: flake8
     args:
@@ -31,7 +31,7 @@ jobs:
      inputs:
        versionSpec: "3.7"
    - script: |
-       pip install flake8==3.9.2
+       pip install flake8==5.0.4
        python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
      displayName: "flake8"

@@ -41,7 +41,7 @@ jobs:
      matrix:
        # We're only running one platform per Python version to speed up builds
        Python36Linux:
-         imageName: "ubuntu-latest"
+         imageName: "ubuntu-20.04"
          python.version: "3.6"
        # Python36Windows:
        #   imageName: "windows-latest"
@@ -50,7 +50,7 @@ jobs:
        #   imageName: "macos-latest"
        #   python.version: "3.6"
        # Python37Linux:
-       #   imageName: "ubuntu-latest"
+       #   imageName: "ubuntu-20.04"
        #   python.version: "3.7"
        Python37Windows:
          imageName: "windows-latest"
@@ -92,20 +92,3 @@ jobs:
      - template: .github/azure-steps.yml
        parameters:
          python_version: '$(python.version)'
-         architecture: 'x64'
-
-# - job: "TestGPU"
-#   dependsOn: "Validate"
-#   strategy:
-#     matrix:
-#       Python38LinuxX64_GPU:
-#         python.version: '3.8'
-#   pool:
-#     name: "LinuxX64_GPU"
-#   steps:
-#     - template: .github/azure-steps.yml
-#       parameters:
-#         python_version: '$(python.version)'
-#         architecture: 'x64'
-#         gpu: true
-#         num_build_jobs: 24
@@ -12,6 +12,7 @@ srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
 typer>=0.3.0,<0.5.0
 pathy>=0.3.5
+smart-open>=5.2.1,<7.0.0
 # Third party dependencies
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
@@ -51,9 +51,10 @@ install_requires =
     wasabi>=0.9.1,<1.1.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
+    # Third-party dependencies
     typer>=0.3.0,<0.5.0
     pathy>=0.3.5
-    # Third-party dependencies
+    smart-open>=5.2.1,<7.0.0
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0
     requests>=2.13.0,<3.0.0
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.3.1"
+__version__ = "3.3.2"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
@@ -358,7 +358,7 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False)
     if dest.exists() and not force:
         return None
     src = str(src)
-    with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
+    with smart_open.open(src, mode="rb", compression="disable") as input_file:
         with dest.open(mode="wb") as output_file:
             shutil.copyfileobj(input_file, output_file)
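Note: newer smart-open releases replaced the ignore_ext flag with an explicit compression argument, which is why the requirements now allow smart-open>=5.2.1,<7.0.0 and the call above passes compression="disable" to copy the file byte-for-byte. A small sketch of the same pattern (the file names are illustrative):

    # Sketch: download a file verbatim, with smart_open's transparent
    # (de)compression switched off so a ".gz" source is stored as-is.
    import shutil
    import smart_open

    def download_verbatim(src: str, dest: str) -> None:
        with smart_open.open(src, mode="rb", compression="disable") as input_file:
            with open(dest, mode="wb") as output_file:
                shutil.copyfileobj(input_file, output_file)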
@@ -227,12 +227,13 @@ def parse_spans(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
             "kb_id": span.kb_id_ if span.kb_id_ else "",
             "kb_url": kb_url_template.format(span.kb_id_) if kb_url_template else "#",
         }
-        for span in doc.spans[spans_key]
+        for span in doc.spans.get(spans_key, [])
     ]
     tokens = [token.text for token in doc]

     if not spans:
-        warnings.warn(Warnings.W117.format(spans_key=spans_key))
+        keys = list(doc.spans.keys())
+        warnings.warn(Warnings.W117.format(spans_key=spans_key, keys=keys))
     title = doc.user_data.get("title", None) if hasattr(doc, "user_data") else None
     settings = get_doc_settings(doc)
     return {
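Note: using doc.spans.get(spans_key, []) means a spans key that was never set now triggers the W117 warning (which, with the next hunk, also lists the keys that do exist) rather than a KeyError. A short sketch of the resulting behaviour, assuming a blank English pipeline:

    # Sketch: visualizing spans under an unset key warns instead of raising.
    import spacy
    from spacy import displacy
    from spacy.tokens import Doc, Span

    nlp = spacy.blank("en")
    doc = Doc(nlp.vocab, words=["Welcome", "to", "the", "Bank", "of", "China"])
    doc.spans["custom"] = [Span(doc, 3, 6, "BANK")]

    # The default spans key ("sc") is unset on this Doc: expect warning W117,
    # listing the available keys, and an empty visualization payload.
    parsed = displacy.parse_spans(doc)
    assert isinstance(parsed, dict)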
@@ -195,7 +195,7 @@ class Warnings(metaclass=ErrorsWithCodes):
     W117 = ("No spans to visualize found in Doc object with spans_key: '{spans_key}'. If this is "
             "surprising to you, make sure the Doc was processed using a model "
             "that supports span categorization, and check the `doc.spans[spans_key]` "
-            "property manually if necessary.")
+            "property manually if necessary.\n\nAvailable keys: {keys}")
     W118 = ("Term '{term}' not found in glossary. It may however be explained in documentation "
             "for the corpora used to train the language. Please check "
             "`nlp.meta[\"sources\"]` for any relevant links.")
@@ -335,6 +335,11 @@ class Errors(metaclass=ErrorsWithCodes):
             "clear the existing vectors and resize the table.")
     E074 = ("Error interpreting compiled match pattern: patterns are expected "
             "to end with the attribute {attr}. Got: {bad_attr}.")
+    E079 = ("Error computing states in beam: number of predicted beams "
+            "({pbeams}) does not equal number of gold beams ({gbeams}).")
+    E080 = ("Duplicate state found in beam: {key}.")
+    E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
+            "does not equal number of losses ({losses}).")
     E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
             "projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
             "match.")
@@ -3,7 +3,7 @@ from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES


 _infixes = (
-    ["·", "ㆍ", "\(", "\)"]
+    ["·", "ㆍ", r"\(", r"\)"]
     + [r"(?<=[0-9])~(?=[0-9-])"]
     + LIST_QUOTES
     + BASE_TOKENIZER_INFIXES
@@ -22,9 +22,15 @@ def forward(model, X, is_train):
     nP = model.get_dim("nP")
     nI = model.get_dim("nI")
     W = model.get_param("W")
-    Yf = model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True)
+    # Preallocate array for layer output, including padding.
+    Yf = model.ops.alloc2f(X.shape[0] + 1, nF * nO * nP)
+    model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True, out=Yf[1:])
     Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))
-    Yf = model.ops.xp.vstack((model.get_param("pad"), Yf))
+
+    # Set padding. Padding has shape (1, nF, nO, nP). Unfortunately, we cannot
+    # change its shape to (nF, nO, nP) without breaking existing models. So
+    # we'll squeeze the first dimension here.
+    Yf[0] = model.ops.xp.squeeze(model.get_param("pad"), 0)

     def backward(dY_ids):
         # This backprop is particularly tricky, because we get back a different
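Note: the rewrite above replaces an xp.vstack of the padding row onto a freshly computed gemm result with a single preallocated buffer that the gemm writes into directly, so adding the padding no longer copies the whole output. A rough numpy-only sketch of the same idea, with made-up shapes:

    # Sketch: "preallocate with a padding row and write into a view" versus
    # "compute, then vstack the padding on top". Shapes are illustrative.
    import numpy as np

    rng = np.random.default_rng(0)
    X = rng.standard_normal((8, 4), dtype=np.float32)    # (n_tokens, nI)
    W = rng.standard_normal((6, 4), dtype=np.float32)    # (nF * nO * nP, nI)
    pad = rng.standard_normal((1, 6), dtype=np.float32)  # padding row

    # Old pattern: two allocations plus a copy when stacking.
    Y_old = np.vstack((pad, X @ W.T))

    # New pattern: one allocation; the matmul writes straight into rows 1..n.
    Y_new = np.empty((X.shape[0] + 1, 6), dtype=np.float32)
    np.matmul(X, W.T, out=Y_new[1:])
    Y_new[0] = pad[0]

    assert np.allclose(Y_old, Y_new)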
@@ -331,9 +331,9 @@ class EditTreeLemmatizer(TrainablePipe):

             tree = dict(tree)
             if "orig" in tree:
-                tree["orig"] = self.vocab.strings[tree["orig"]]
+                tree["orig"] = self.vocab.strings.add(tree["orig"])
             if "orig" in tree:
-                tree["subst"] = self.vocab.strings[tree["subst"]]
+                tree["subst"] = self.vocab.strings.add(tree["subst"])

             trees.append(tree)
@@ -269,7 +269,10 @@ class SpanCategorizer(TrainablePipe):
         DOCS: https://spacy.io/api/spancategorizer#predict
         """
         indices = self.suggester(docs, ops=self.model.ops)
-        scores = self.model.predict((docs, indices))  # type: ignore
+        if indices.lengths.sum() == 0:
+            scores = self.model.ops.alloc2f(0, 0)
+        else:
+            scores = self.model.predict((docs, indices))  # type: ignore
         return indices, scores

     def set_candidates(
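Note: the guard above skips the network call entirely when the suggester proposes no spans for the whole batch, returning an empty (0, 0) score array so the downstream bookkeeping still lines up with the (empty) indices. A self-contained sketch of the same guard using thinc primitives; run_model stands in for the real spancat model:

    # Sketch: skip prediction when a span suggester yields zero candidates.
    from thinc.api import NumpyOps
    from thinc.types import Ragged

    ops = NumpyOps()

    def predict_scores(run_model, docs, indices: Ragged):
        if indices.lengths.sum() == 0:
            return ops.alloc2f(0, 0)  # no spans anywhere in the batch
        return run_model((docs, indices))

    # Two "docs" for which the suggester proposed nothing at all.
    empty = Ragged(ops.xp.zeros((0, 0), dtype="i"), ops.asarray1i([0, 0]))
    scores = predict_scores(lambda _: None, ["doc one", "doc two"], empty)
    print(scores.shape)  # (0, 0)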
@@ -123,14 +123,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):

     # head before start
     arr = doc.to_array(["HEAD"])
-    arr[0] = -1
+    arr[0] = numpy.int32(-1).astype(numpy.uint64)
     doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)

     # head after end
     arr = doc.to_array(["HEAD"])
-    arr[0] = 5
+    arr[0] = numpy.int32(5).astype(numpy.uint64)
     doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)
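Note: this test change, like the Doc and Span changes further down, replaces bare negative assignments into spaCy's uint64 attribute arrays with an explicit numpy.int32(...).astype(numpy.uint64) cast, because newer NumPy versions warn about or reject implicitly converting an out-of-range Python integer into an unsigned array. A small sketch of the wrap-around involved:

    # Sketch: store a negative relative head offset in a uint64 array by
    # casting explicitly instead of relying on implicit conversion.
    import numpy

    arr = numpy.zeros(3, dtype=numpy.uint64)
    arr[0] = numpy.int32(-1).astype(numpy.uint64)  # wraps to 2**64 - 1

    print(arr[0])                    # 18446744073709551615
    print(arr.view(numpy.int64)[0])  # -1 when the same bytes are read as signed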
@@ -3,6 +3,7 @@ import weakref
 import numpy
 from numpy.testing import assert_array_equal
 import pytest
+import warnings
 from thinc.api import NumpyOps, get_current_ops

 from spacy.attrs import DEP, ENT_IOB, ENT_TYPE, HEAD, IS_ALPHA, MORPH, POS
@@ -529,9 +530,9 @@ def test_doc_from_array_sent_starts(en_vocab):
     # no warning using default attrs
     attrs = doc._get_array_attrs()
     arr = doc.to_array(attrs)
-    with pytest.warns(None) as record:
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
         new_doc.from_array(attrs, arr)
-    assert len(record) == 0

     # only SENT_START uses SENT_START
     attrs = [SENT_START]
     arr = doc.to_array(attrs)
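Note: pytest.warns(None) is deprecated as of pytest 7 and no longer expresses "assert that no warning was raised", so these tests now turn warnings into errors for the block under test instead. A minimal sketch of the replacement idiom:

    # Sketch: fail if the wrapped block emits any warning at all.
    import warnings

    def quiet_operation():
        return 2 + 2

    with warnings.catch_warnings():
        warnings.simplefilter("error")  # any warning raised here becomes an exception
        assert quiet_operation() == 4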
@@ -2,6 +2,9 @@ import pytest
 from spacy.tokens import Doc


+pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
+
+
 def test_ru_doc_lemmatization(ru_lemmatizer):
     words = ["мама", "мыла", "раму"]
     pos = ["NOUN", "VERB", "NOUN"]
@@ -1,6 +1,10 @@
+import pytest
 from spacy.tokens import Doc


+pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
+
+
 def test_uk_lemmatizer(uk_lemmatizer):
     """Check that the default uk lemmatizer runs."""
     doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])
@@ -1,4 +1,5 @@
 import pytest
+import warnings
 import srsly
 from mock import Mock
@@ -344,13 +345,13 @@ def test_phrase_matcher_validation(en_vocab):
     matcher.add("TEST1", [doc1])
     with pytest.warns(UserWarning):
         matcher.add("TEST2", [doc2])
-    with pytest.warns(None) as record:
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
         matcher.add("TEST3", [doc3])
-    assert not record.list
     matcher = PhraseMatcher(en_vocab, attr="POS", validate=True)
-    with pytest.warns(None) as record:
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
         matcher.add("TEST4", [doc2])
-    assert not record.list


 def test_attr_validation(en_vocab):
@@ -60,10 +60,45 @@ def test_initialize_from_labels():
     nlp2 = Language()
     lemmatizer2 = nlp2.add_pipe("trainable_lemmatizer")
     lemmatizer2.initialize(
-        get_examples=lambda: train_examples,
+        # We want to check that the strings in replacement nodes are
+        # added to the string store. Avoid that they get added through
+        # the examples.
+        get_examples=lambda: train_examples[:1],
         labels=lemmatizer.label_data,
     )
     assert lemmatizer2.tree2label == {1: 0, 3: 1, 4: 2, 6: 3}
+    assert lemmatizer2.label_data == {
+        "trees": [
+            {"orig": "S", "subst": "s"},
+            {
+                "prefix_len": 1,
+                "suffix_len": 0,
+                "prefix_tree": 0,
+                "suffix_tree": 4294967295,
+            },
+            {"orig": "s", "subst": ""},
+            {
+                "prefix_len": 0,
+                "suffix_len": 1,
+                "prefix_tree": 4294967295,
+                "suffix_tree": 2,
+            },
+            {
+                "prefix_len": 0,
+                "suffix_len": 0,
+                "prefix_tree": 4294967295,
+                "suffix_tree": 4294967295,
+            },
+            {"orig": "E", "subst": "e"},
+            {
+                "prefix_len": 1,
+                "suffix_len": 0,
+                "prefix_tree": 5,
+                "suffix_tree": 4294967295,
+            },
+        ],
+        "labels": (1, 3, 4, 6),
+    }


 def test_no_data():
@@ -1048,6 +1048,10 @@ def test_no_gold_ents(patterns):
     for eg in train_examples:
         eg.predicted = ruler(eg.predicted)

+    # Entity ruler is no longer needed (initialization below wipes out the
+    # patterns and causes warnings)
+    nlp.remove_pipe("entity_ruler")
+
     def create_kb(vocab):
         # create artificial KB
         mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
@@ -372,24 +372,39 @@ def test_overfitting_IO_overlapping():


 def test_zero_suggestions():
-    # Test with a suggester that returns 0 suggestions
+    # Test with a suggester that can return 0 suggestions

-    @registry.misc("test_zero_suggester")
-    def make_zero_suggester():
-        def zero_suggester(docs, *, ops=None):
+    @registry.misc("test_mixed_zero_suggester")
+    def make_mixed_zero_suggester():
+        def mixed_zero_suggester(docs, *, ops=None):
             if ops is None:
                 ops = get_current_ops()
-            return Ragged(
-                ops.xp.zeros((0, 0), dtype="i"), ops.xp.zeros((len(docs),), dtype="i")
-            )
+            spans = []
+            lengths = []
+            for doc in docs:
+                if len(doc) > 0 and len(doc) % 2 == 0:
+                    spans.append((0, 1))
+                    lengths.append(1)
+                else:
+                    lengths.append(0)
+            spans = ops.asarray2i(spans)
+            lengths_array = ops.asarray1i(lengths)
+            if len(spans) > 0:
+                output = Ragged(ops.xp.vstack(spans), lengths_array)
+            else:
+                output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array)
+            return output

-        return zero_suggester
+        return mixed_zero_suggester

     fix_random_seed(0)
     nlp = English()
     spancat = nlp.add_pipe(
         "spancat",
-        config={"suggester": {"@misc": "test_zero_suggester"}, "spans_key": SPAN_KEY},
+        config={
+            "suggester": {"@misc": "test_mixed_zero_suggester"},
+            "spans_key": SPAN_KEY,
+        },
     )
     train_examples = make_examples(nlp)
     optimizer = nlp.initialize(get_examples=lambda: train_examples)
@@ -397,6 +412,16 @@ def test_zero_suggestions():
     assert set(spancat.labels) == {"LOC", "PERSON"}

     nlp.update(train_examples, sgd=optimizer)
+    # empty doc
+    nlp("")
+    # single doc with zero suggestions
+    nlp("one")
+    # single doc with one suggestion
+    nlp("two two")
+    # batch with mixed zero/one suggestions
+    list(nlp.pipe(["one", "two two", "three three three", "", "four four four four"]))
+    # batch with no suggestions
+    list(nlp.pipe(["", "one", "three three three"]))


 def test_set_candidates():
@@ -15,6 +15,7 @@ from spacy.cli._util import is_subpath_of, load_project_config
 from spacy.cli._util import parse_config_overrides, string_to_list
 from spacy.cli._util import substitute_project_variables
 from spacy.cli._util import validate_project_commands
+from spacy.cli._util import upload_file, download_file
 from spacy.cli.debug_data import _compile_gold, _get_labels_from_model
 from spacy.cli.debug_data import _get_labels_from_spancat
 from spacy.cli.debug_data import _get_distribution, _get_kl_divergence
@@ -855,3 +856,18 @@ def test_span_length_freq_dist_output_must_be_correct():
     span_freqs = _get_spans_length_freq_dist(sample_span_lengths, threshold)
     assert sum(span_freqs.values()) >= threshold
     assert list(span_freqs.keys()) == [3, 1, 4, 5, 2]
+
+
+def test_upload_download_local_file():
+    with make_tempdir() as d1, make_tempdir() as d2:
+        filename = "f.txt"
+        content = "content"
+        local_file = d1 / filename
+        remote_file = d2 / filename
+        with local_file.open(mode="w") as file_:
+            file_.write(content)
+        upload_file(local_file, remote_file)
+        local_file.unlink()
+        download_file(remote_file, local_file)
+        with local_file.open(mode="r") as file_:
+            assert file_.read() == content
@@ -203,6 +203,16 @@ def test_displacy_parse_spans_different_spans_key(en_vocab):
     ]


+def test_displacy_parse_empty_spans_key(en_vocab):
+    """Test that having an unset spans key doesn't raise an error"""
+    doc = Doc(en_vocab, words=["Welcome", "to", "the", "Bank", "of", "China"])
+    doc.spans["custom"] = [Span(doc, 3, 6, "BANK")]
+    with pytest.warns(UserWarning, match="W117"):
+        spans = displacy.parse_spans(doc)
+
+    assert isinstance(spans, dict)
+
+
 def test_displacy_parse_ents(en_vocab):
     """Test that named entities on a Doc are converted into displaCy's format."""
     doc = Doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
@@ -23,7 +23,7 @@ def get_textcat_bow_kwargs():


 def get_textcat_cnn_kwargs():
-    return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13}
+    return {"tok2vec": make_test_tok2vec(), "exclusive_classes": False, "nO": 13}


 def get_all_params(model):
@@ -65,7 +65,7 @@ def get_tok2vec_kwargs():
     }


-def test_tok2vec():
+def make_test_tok2vec():
     return build_Tok2Vec_model(**get_tok2vec_kwargs())


@@ -7,7 +7,7 @@ from ..util import get_cosine, add_vecs_to_vocab

 @pytest.fixture
 def vectors():
-    return [("apple", [1, 2, 3]), ("orange", [-1, -2, -3])]
+    return [("apple", [1, 2, 3]), ("orange", [-1, -2, -5])]


 @pytest.fixture()
@@ -71,19 +71,17 @@ def test_vectors_similarity_DD(vocab, vectors):
 def test_vectors_similarity_TD(vocab, vectors):
     [(word1, vec1), (word2, vec2)] = vectors
     doc = Doc(vocab, words=[word1, word2])
-    with pytest.warns(UserWarning):
-        assert isinstance(doc.similarity(doc[0]), float)
-        assert isinstance(doc[0].similarity(doc), float)
-        assert doc.similarity(doc[0]) == doc[0].similarity(doc)
+    assert isinstance(doc.similarity(doc[0]), float)
+    assert isinstance(doc[0].similarity(doc), float)
+    assert doc.similarity(doc[0]) == doc[0].similarity(doc)


 def test_vectors_similarity_TS(vocab, vectors):
     [(word1, vec1), (word2, vec2)] = vectors
     doc = Doc(vocab, words=[word1, word2])
-    with pytest.warns(UserWarning):
-        assert isinstance(doc[:2].similarity(doc[0]), float)
-        assert isinstance(doc[0].similarity(doc[-2]), float)
-        assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2])
+    assert isinstance(doc[:2].similarity(doc[0]), float)
+    assert isinstance(doc[0].similarity(doc[-2]), float)
+    assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2])


 def test_vectors_similarity_DS(vocab, vectors):
@@ -359,6 +359,7 @@ cdef class Doc:
             for annot in annotations:
                 if annot:
                     if annot is heads or annot is sent_starts or annot is ent_iobs:
+                        annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
                         for i in range(len(words)):
                             if attrs.ndim == 1:
                                 attrs[i] = annot[i]
@@ -1557,6 +1558,7 @@ cdef class Doc:

         for j, (attr, annot) in enumerate(token_annotations.items()):
             if attr is HEAD:
+                annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
                 for i in range(len(words)):
                     array[i, j] = annot[i]
             elif attr is MORPH:
@@ -299,7 +299,7 @@ cdef class Span:
         for ancestor in ancestors:
             ancestor_i = ancestor.i - self.c.start
             if ancestor_i in range(length):
-                array[i, head_col] = ancestor_i - i
+                array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64)

         # if there is no appropriate ancestor, define a new artificial root
         value = array[i, head_col]
@@ -307,7 +307,7 @@ cdef class Span:
             new_root = old_to_new_root.get(ancestor_i, None)
             if new_root is not None:
                 # take the same artificial root as a previous token from the same sentence
-                array[i, head_col] = new_root - i
+                array[i, head_col] = numpy.int32(new_root - i).astype(numpy.uint64)
             else:
                 # set this token as the new artificial root
                 array[i, head_col] = 0
@@ -353,26 +353,27 @@ def _annot2array(vocab, tok_annot, doc_annot):
         if key not in IDS:
             raise ValueError(Errors.E974.format(obj="token", key=key))
         elif key in ["ORTH", "SPACY"]:
-            pass
+            continue
         elif key == "HEAD":
             attrs.append(key)
-            values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
+            row = [h-i if h is not None else 0 for i, h in enumerate(value)]
         elif key == "DEP":
             attrs.append(key)
-            values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
+            row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]
         elif key == "SENT_START":
             attrs.append(key)
-            values.append([to_ternary_int(v) for v in value])
+            row = [to_ternary_int(v) for v in value]
        elif key == "MORPH":
             attrs.append(key)
-            values.append([vocab.morphology.add(v) for v in value])
+            row = [vocab.morphology.add(v) for v in value]
         else:
             attrs.append(key)
             if not all(isinstance(v, str) for v in value):
                 types = set([type(v) for v in value])
                 raise TypeError(Errors.E969.format(field=key, types=types)) from None
-            values.append([vocab.strings.add(v) for v in value])
-    array = numpy.asarray(values, dtype="uint64")
+            row = [vocab.strings.add(v) for v in value]
+        values.append([numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v for v in row])
+    array = numpy.array(values, dtype=numpy.uint64)
     return attrs, array.T
@@ -337,3 +337,5 @@ def ensure_shape(vectors_loc):
         # store all the results in a list in memory
         lines2 = open_file(vectors_loc)
         yield from lines2
+        lines2.close()
+        lines.close()