Merge branch 'master' into pr/6444

This commit is contained in:
Ines Montani 2020-12-09 11:09:40 +11:00
commit 1980203229
8 changed files with 123 additions and 100 deletions

View File

@ -2,48 +2,47 @@ trigger:
batch: true batch: true
branches: branches:
include: include:
- '*' - "*"
exclude: exclude:
- 'spacy.io' - "spacy.io"
paths: paths:
exclude: exclude:
- 'website/*' - "website/*"
- '*.md' - "*.md"
pr: pr:
paths: paths:
exclude: exclude:
- 'website/*' - "website/*"
- '*.md' - "*.md"
jobs: jobs:
# Perform basic checks for most important errors (syntax etc.) Uses the config
# Perform basic checks for most important errors (syntax etc.) Uses the config # defined in .flake8 and overwrites the selected codes.
# defined in .flake8 and overwrites the selected codes. - job: "Validate"
- job: 'Validate'
pool: pool:
vmImage: 'ubuntu-16.04' vmImage: "ubuntu-16.04"
steps: steps:
- task: UsePythonVersion@0 - task: UsePythonVersion@0
inputs: inputs:
versionSpec: '3.7' versionSpec: "3.7"
- script: | - script: |
pip install flake8==3.5.0 pip install flake8==3.5.0
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
displayName: 'flake8' displayName: "flake8"
- job: 'Test' - job: "Test"
dependsOn: 'Validate' dependsOn: "Validate"
strategy: strategy:
matrix: matrix:
Python36Linux: Python36Linux:
imageName: 'ubuntu-16.04' imageName: "ubuntu-16.04"
python.version: '3.6' python.version: "3.6"
Python36Windows: Python36Windows:
imageName: 'vs2017-win2016' imageName: "vs2017-win2016"
python.version: '3.6' python.version: "3.6"
Python36Mac: Python36Mac:
imageName: 'macos-10.14' imageName: "macos-10.14"
python.version: '3.6' python.version: "3.6"
# Don't test on 3.7 for now to speed up builds # Don't test on 3.7 for now to speed up builds
# Python37Linux: # Python37Linux:
# imageName: 'ubuntu-16.04' # imageName: 'ubuntu-16.04'
@ -55,23 +54,23 @@ jobs:
# imageName: 'macos-10.14' # imageName: 'macos-10.14'
# python.version: '3.7' # python.version: '3.7'
Python38Linux: Python38Linux:
imageName: 'ubuntu-16.04' imageName: "ubuntu-16.04"
python.version: '3.8' python.version: "3.8"
Python38Windows: Python38Windows:
imageName: 'vs2017-win2016' imageName: "vs2017-win2016"
python.version: '3.8' python.version: "3.8"
Python38Mac: Python38Mac:
imageName: 'macos-10.14' imageName: "macos-10.14"
python.version: '3.8' python.version: "3.8"
Python39Linux: Python39Linux:
imageName: 'ubuntu-16.04' imageName: "ubuntu-16.04"
python.version: '3.9' python.version: "3.9"
Python39Windows: Python39Windows:
imageName: 'vs2017-win2016' imageName: "vs2017-win2016"
python.version: '3.9' python.version: "3.9"
Python39Mac: Python39Mac:
imageName: 'macos-10.14' imageName: "macos-10.14"
python.version: '3.9' python.version: "3.9"
maxParallel: 4 maxParallel: 4
pool: pool:
vmImage: $(imageName) vmImage: $(imageName)
@ -79,35 +78,37 @@ jobs:
steps: steps:
- task: UsePythonVersion@0 - task: UsePythonVersion@0
inputs: inputs:
versionSpec: '$(python.version)' versionSpec: "$(python.version)"
architecture: 'x64' architecture: "x64"
- script: | - script: |
python -m pip install -U pip setuptools python -m pip install -U pip setuptools
pip install -r requirements.txt pip install -r requirements.txt
displayName: 'Install dependencies' displayName: "Install dependencies"
condition: not(eq(variables['python.version'], '3.5'))
- script: | - script: |
python setup.py build_ext --inplace -j 2 python setup.py build_ext --inplace -j 2
python setup.py sdist --formats=gztar python setup.py sdist --formats=gztar
displayName: 'Compile and build sdist' displayName: "Compile and build sdist"
- task: DeleteFiles@1 - task: DeleteFiles@1
inputs: inputs:
contents: 'spacy' contents: "spacy"
displayName: 'Delete source directory' displayName: "Delete source directory"
- script: | - script: |
pip freeze > installed.txt pip freeze > installed.txt
pip uninstall -y -r installed.txt pip uninstall -y -r installed.txt
displayName: 'Uninstall all packages' displayName: "Uninstall all packages"
- bash: | - bash: |
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
pip install dist/$SDIST pip install dist/$SDIST
displayName: 'Install from sdist' displayName: "Install from sdist"
condition: not(eq(variables['python.version'], '3.5'))
- script: | - script: |
pip install -r requirements.txt pip install -r requirements.txt
python -m pytest --pyargs spacy python -m pytest --pyargs spacy
displayName: 'Run tests' displayName: "Run tests"

View File

@ -968,10 +968,6 @@ class Language:
DOCS: https://nightly.spacy.io/api/language#call DOCS: https://nightly.spacy.io/api/language#call
""" """
if len(text) > self.max_length:
raise ValueError(
Errors.E088.format(length=len(text), max_length=self.max_length)
)
doc = self.make_doc(text) doc = self.make_doc(text)
if component_cfg is None: if component_cfg is None:
component_cfg = {} component_cfg = {}
@ -1045,6 +1041,11 @@ class Language:
text (str): The text to process. text (str): The text to process.
RETURNS (Doc): The processed doc. RETURNS (Doc): The processed doc.
""" """
if len(text) > self.max_length:
raise ValueError(
Errors.E088.format(length=len(text), max_length=self.max_length)
)
return self.tokenizer(text)
return self.tokenizer(text) return self.tokenizer(text)
def update( def update(

View File

@ -261,7 +261,11 @@ class EntityRuler(Pipe):
# disable the nlp components after this one in case they hadn't been initialized / deserialised yet # disable the nlp components after this one in case they hadn't been initialized / deserialised yet
try: try:
current_index = self.nlp.pipe_names.index(self.name) current_index = -1
for i, (name, pipe) in enumerate(self.nlp.pipeline):
if self == pipe:
current_index = i
break
subsequent_pipes = [ subsequent_pipes = [
pipe for pipe in self.nlp.pipe_names[current_index + 1 :] pipe for pipe in self.nlp.pipe_names[current_index + 1 :]
] ]

View File

@ -416,6 +416,13 @@ def test_doc_retokenizer_merge_lex_attrs(en_vocab):
assert doc[1].is_stop assert doc[1].is_stop
assert not doc[0].is_stop assert not doc[0].is_stop
assert not doc[1].like_num assert not doc[1].like_num
# Test that norm is only set on tokens
doc = Doc(en_vocab, words=["eins", "zwei", "!", "!"])
assert doc[0].norm_ == "eins"
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[0:1], attrs={"norm": "1"})
assert doc[0].norm_ == "1"
assert en_vocab["eins"].norm_ == "eins"
def test_retokenize_skip_duplicates(en_vocab): def test_retokenize_skip_duplicates(en_vocab):

View File

@ -457,6 +457,7 @@ def test_attr_pipeline_checks(en_vocab):
([{"IS_LEFT_PUNCT": True}], "``"), ([{"IS_LEFT_PUNCT": True}], "``"),
([{"IS_RIGHT_PUNCT": True}], "''"), ([{"IS_RIGHT_PUNCT": True}], "''"),
([{"IS_STOP": True}], "the"), ([{"IS_STOP": True}], "the"),
([{"SPACY": True}], "the"),
([{"LIKE_NUM": True}], "1"), ([{"LIKE_NUM": True}], "1"),
([{"LIKE_URL": True}], "http://example.com"), ([{"LIKE_URL": True}], "http://example.com"),
([{"LIKE_EMAIL": True}], "mail@example.com"), ([{"LIKE_EMAIL": True}], "mail@example.com"),

View File

@ -372,8 +372,9 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
# Set attributes on both token and lexeme to take care of token # Set attributes on both token and lexeme to take care of token
# attribute vs. lexical attribute without having to enumerate # attribute vs. lexical attribute without having to enumerate
# them. If an attribute name is not valid, set_struct_attr will # them. If an attribute name is not valid, set_struct_attr will
# ignore it. # ignore it. Exception: set NORM only on tokens.
Token.set_struct_attr(token, attr_name, get_string_id(attr_value)) Token.set_struct_attr(token, attr_name, get_string_id(attr_value))
if attr_name != NORM:
Lexeme.set_struct_attr(<LexemeC*>token.lex, attr_name, get_string_id(attr_value)) Lexeme.set_struct_attr(<LexemeC*>token.lex, attr_name, get_string_id(attr_value))
# Assign correct dependencies to the inner token # Assign correct dependencies to the inner token
for i, head in enumerate(heads): for i, head in enumerate(heads):

View File

@ -1295,6 +1295,13 @@ def combine_score_weights(
class DummyTokenizer: class DummyTokenizer:
def __call__(self, text):
raise NotImplementedError
def pipe(self, texts, **kwargs):
for text in texts:
yield self(text)
# add dummy methods for to_bytes, from_bytes, to_disk and from_disk to # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
# allow serialization (see #1557) # allow serialization (see #1557)
def to_bytes(self, **kwargs): def to_bytes(self, **kwargs):

View File

@ -169,6 +169,7 @@ rule-based matching are:
|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ | |  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ |
|  `IS_SENT_START` | Token is start of sentence. ~~bool~~ | |  `IS_SENT_START` | Token is start of sentence. ~~bool~~ |
|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ | |  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ |
| `SPACY` | Token has a trailing space. ~~bool~~ |
|  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. Note that the values of these attributes are case-sensitive. For a list of available part-of-speech tags and dependency labels, see the [Annotation Specifications](/api/annotation). ~~str~~ | |  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. Note that the values of these attributes are case-sensitive. For a list of available part-of-speech tags and dependency labels, see the [Annotation Specifications](/api/annotation). ~~str~~ |
| `ENT_TYPE` | The token's entity label. ~~str~~ | | `ENT_TYPE` | The token's entity label. ~~str~~ |
| `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ | | `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |