Merge branch 'master' into pr/6444

This commit is contained in:
Ines Montani 2020-12-09 11:09:40 +11:00
commit 1980203229
8 changed files with 123 additions and 100 deletions

View File

@ -2,112 +2,113 @@ trigger:
batch: true
branches:
include:
- '*'
- "*"
exclude:
- 'spacy.io'
- "spacy.io"
paths:
exclude:
- 'website/*'
- '*.md'
- "website/*"
- "*.md"
pr:
paths:
exclude:
- 'website/*'
- '*.md'
- "website/*"
- "*.md"
jobs:
# Perform basic checks for the most important errors (syntax etc.). Uses the
# config defined in .flake8 and overrides the selected codes.
- job: "Validate"
pool:
vmImage: "ubuntu-16.04"
steps:
- task: UsePythonVersion@0
inputs:
versionSpec: "3.7"
- script: |
pip install flake8==3.5.0
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
displayName: "flake8"
# Perform basic checks for the most important errors (syntax etc.). Uses the
# config defined in .flake8 and overrides the selected codes.
- job: 'Validate'
pool:
vmImage: 'ubuntu-16.04'
steps:
- task: UsePythonVersion@0
inputs:
versionSpec: '3.7'
- script: |
pip install flake8==3.5.0
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
displayName: 'flake8'
- job: "Test"
dependsOn: "Validate"
strategy:
matrix:
Python36Linux:
imageName: "ubuntu-16.04"
python.version: "3.6"
Python36Windows:
imageName: "vs2017-win2016"
python.version: "3.6"
Python36Mac:
imageName: "macos-10.14"
python.version: "3.6"
# Don't test on 3.7 for now to speed up builds
# Python37Linux:
# imageName: 'ubuntu-16.04'
# python.version: '3.7'
# Python37Windows:
# imageName: 'vs2017-win2016'
# python.version: '3.7'
# Python37Mac:
# imageName: 'macos-10.14'
# python.version: '3.7'
Python38Linux:
imageName: "ubuntu-16.04"
python.version: "3.8"
Python38Windows:
imageName: "vs2017-win2016"
python.version: "3.8"
Python38Mac:
imageName: "macos-10.14"
python.version: "3.8"
Python39Linux:
imageName: "ubuntu-16.04"
python.version: "3.9"
Python39Windows:
imageName: "vs2017-win2016"
python.version: "3.9"
Python39Mac:
imageName: "macos-10.14"
python.version: "3.9"
maxParallel: 4
pool:
vmImage: $(imageName)
- job: 'Test'
dependsOn: 'Validate'
strategy:
matrix:
Python36Linux:
imageName: 'ubuntu-16.04'
python.version: '3.6'
Python36Windows:
imageName: 'vs2017-win2016'
python.version: '3.6'
Python36Mac:
imageName: 'macos-10.14'
python.version: '3.6'
# Don't test on 3.7 for now to speed up builds
# Python37Linux:
# imageName: 'ubuntu-16.04'
# python.version: '3.7'
# Python37Windows:
# imageName: 'vs2017-win2016'
# python.version: '3.7'
# Python37Mac:
# imageName: 'macos-10.14'
# python.version: '3.7'
Python38Linux:
imageName: 'ubuntu-16.04'
python.version: '3.8'
Python38Windows:
imageName: 'vs2017-win2016'
python.version: '3.8'
Python38Mac:
imageName: 'macos-10.14'
python.version: '3.8'
Python39Linux:
imageName: 'ubuntu-16.04'
python.version: '3.9'
Python39Windows:
imageName: 'vs2017-win2016'
python.version: '3.9'
Python39Mac:
imageName: 'macos-10.14'
python.version: '3.9'
maxParallel: 4
pool:
vmImage: $(imageName)
steps:
- task: UsePythonVersion@0
inputs:
versionSpec: "$(python.version)"
architecture: "x64"
steps:
- task: UsePythonVersion@0
inputs:
versionSpec: '$(python.version)'
architecture: 'x64'
- script: |
python -m pip install -U pip setuptools
pip install -r requirements.txt
displayName: "Install dependencies"
condition: not(eq(variables['python.version'], '3.5'))
- script: |
python -m pip install -U pip setuptools
pip install -r requirements.txt
displayName: 'Install dependencies'
- script: |
python setup.py build_ext --inplace -j 2
python setup.py sdist --formats=gztar
displayName: "Compile and build sdist"
- script: |
python setup.py build_ext --inplace -j 2
python setup.py sdist --formats=gztar
displayName: 'Compile and build sdist'
- task: DeleteFiles@1
inputs:
contents: "spacy"
displayName: "Delete source directory"
- task: DeleteFiles@1
inputs:
contents: 'spacy'
displayName: 'Delete source directory'
- script: |
pip freeze > installed.txt
pip uninstall -y -r installed.txt
displayName: "Uninstall all packages"
- script: |
pip freeze > installed.txt
pip uninstall -y -r installed.txt
displayName: 'Uninstall all packages'
- bash: |
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
pip install dist/$SDIST
displayName: "Install from sdist"
condition: not(eq(variables['python.version'], '3.5'))
- bash: |
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
pip install dist/$SDIST
displayName: 'Install from sdist'
- script: |
pip install -r requirements.txt
python -m pytest --pyargs spacy
displayName: 'Run tests'
- script: |
pip install -r requirements.txt
python -m pytest --pyargs spacy
displayName: "Run tests"

View File

@ -968,10 +968,6 @@ class Language:
DOCS: https://nightly.spacy.io/api/language#call
"""
if len(text) > self.max_length:
raise ValueError(
Errors.E088.format(length=len(text), max_length=self.max_length)
)
doc = self.make_doc(text)
if component_cfg is None:
component_cfg = {}
@ -1045,6 +1041,11 @@ class Language:
text (str): The text to process.
RETURNS (Doc): The processed doc.
"""
if len(text) > self.max_length:
raise ValueError(
Errors.E088.format(length=len(text), max_length=self.max_length)
)
return self.tokenizer(text)
return self.tokenizer(text)
def update(

View File

@ -261,7 +261,11 @@ class EntityRuler(Pipe):
# Disable the nlp components after this one in case they haven't been initialized / deserialized yet
try:
current_index = self.nlp.pipe_names.index(self.name)
current_index = -1
for i, (name, pipe) in enumerate(self.nlp.pipeline):
if self == pipe:
current_index = i
break
subsequent_pipes = [
pipe for pipe in self.nlp.pipe_names[current_index + 1 :]
]

View File

@ -416,6 +416,13 @@ def test_doc_retokenizer_merge_lex_attrs(en_vocab):
assert doc[1].is_stop
assert not doc[0].is_stop
assert not doc[1].like_num
# Test that norm is only set on tokens
doc = Doc(en_vocab, words=["eins", "zwei", "!", "!"])
assert doc[0].norm_ == "eins"
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[0:1], attrs={"norm": "1"})
assert doc[0].norm_ == "1"
assert en_vocab["eins"].norm_ == "eins"
def test_retokenize_skip_duplicates(en_vocab):

View File

@ -457,6 +457,7 @@ def test_attr_pipeline_checks(en_vocab):
([{"IS_LEFT_PUNCT": True}], "``"),
([{"IS_RIGHT_PUNCT": True}], "''"),
([{"IS_STOP": True}], "the"),
([{"SPACY": True}], "the"),
([{"LIKE_NUM": True}], "1"),
([{"LIKE_URL": True}], "http://example.com"),
([{"LIKE_EMAIL": True}], "mail@example.com"),

View File

@ -372,9 +372,10 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
# Set attributes on both token and lexeme to take care of token
# attribute vs. lexical attribute without having to enumerate
# them. If an attribute name is not valid, set_struct_attr will
# ignore it.
# ignore it. Exception: set NORM only on tokens.
Token.set_struct_attr(token, attr_name, get_string_id(attr_value))
Lexeme.set_struct_attr(<LexemeC*>token.lex, attr_name, get_string_id(attr_value))
if attr_name != NORM:
Lexeme.set_struct_attr(<LexemeC*>token.lex, attr_name, get_string_id(attr_value))
# Assign correct dependencies to the inner token
for i, head in enumerate(heads):
doc.c[token_index + i].head = head

View File

@ -1295,6 +1295,13 @@ def combine_score_weights(
class DummyTokenizer:
def __call__(self, text):
raise NotImplementedError
def pipe(self, texts, **kwargs):
for text in texts:
yield self(text)
# add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
# allow serialization (see #1557)
def to_bytes(self, **kwargs):

View File

@ -169,6 +169,7 @@ rule-based matching are:
|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ |
|  `IS_SENT_START` | Token is start of sentence. ~~bool~~ |
|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ |
| `SPACY` | Token has a trailing space. ~~bool~~ |
|  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. Note that the values of these attributes are case-sensitive. For a list of available part-of-speech tags and dependency labels, see the [Annotation Specifications](/api/annotation). ~~str~~ |
| `ENT_TYPE` | The token's entity label. ~~str~~ |
| `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |