diff --git a/azure-pipelines.yml b/azure-pipelines.yml index a1b9776e9..2d8caae5c 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -2,112 +2,113 @@ trigger: batch: true branches: include: - - '*' + - "*" exclude: - - 'spacy.io' + - "spacy.io" paths: exclude: - - 'website/*' - - '*.md' + - "website/*" + - "*.md" pr: paths: exclude: - - 'website/*' - - '*.md' + - "website/*" + - "*.md" jobs: + # Perform basic checks for most important errors (syntax etc.) Uses the config + # defined in .flake8 and overwrites the selected codes. + - job: "Validate" + pool: + vmImage: "ubuntu-16.04" + steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: "3.7" + - script: | + pip install flake8==3.5.0 + python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics + displayName: "flake8" -# Perform basic checks for most important errors (syntax etc.) Uses the config -# defined in .flake8 and overwrites the selected codes. -- job: 'Validate' - pool: - vmImage: 'ubuntu-16.04' - steps: - - task: UsePythonVersion@0 - inputs: - versionSpec: '3.7' - - script: | - pip install flake8==3.5.0 - python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics - displayName: 'flake8' + - job: "Test" + dependsOn: "Validate" + strategy: + matrix: + Python36Linux: + imageName: "ubuntu-16.04" + python.version: "3.6" + Python36Windows: + imageName: "vs2017-win2016" + python.version: "3.6" + Python36Mac: + imageName: "macos-10.14" + python.version: "3.6" + # Don't test on 3.7 for now to speed up builds + # Python37Linux: + # imageName: 'ubuntu-16.04' + # python.version: '3.7' + # Python37Windows: + # imageName: 'vs2017-win2016' + # python.version: '3.7' + # Python37Mac: + # imageName: 'macos-10.14' + # python.version: '3.7' + Python38Linux: + imageName: "ubuntu-16.04" + python.version: "3.8" + Python38Windows: + imageName: "vs2017-win2016" + python.version: "3.8" + Python38Mac: + imageName: "macos-10.14" + python.version: "3.8" 
+ Python39Linux: + imageName: "ubuntu-16.04" + python.version: "3.9" + Python39Windows: + imageName: "vs2017-win2016" + python.version: "3.9" + Python39Mac: + imageName: "macos-10.14" + python.version: "3.9" + maxParallel: 4 + pool: + vmImage: $(imageName) -- job: 'Test' - dependsOn: 'Validate' - strategy: - matrix: - Python36Linux: - imageName: 'ubuntu-16.04' - python.version: '3.6' - Python36Windows: - imageName: 'vs2017-win2016' - python.version: '3.6' - Python36Mac: - imageName: 'macos-10.14' - python.version: '3.6' - # Don't test on 3.7 for now to speed up builds - # Python37Linux: - # imageName: 'ubuntu-16.04' - # python.version: '3.7' - # Python37Windows: - # imageName: 'vs2017-win2016' - # python.version: '3.7' - # Python37Mac: - # imageName: 'macos-10.14' - # python.version: '3.7' - Python38Linux: - imageName: 'ubuntu-16.04' - python.version: '3.8' - Python38Windows: - imageName: 'vs2017-win2016' - python.version: '3.8' - Python38Mac: - imageName: 'macos-10.14' - python.version: '3.8' - Python39Linux: - imageName: 'ubuntu-16.04' - python.version: '3.9' - Python39Windows: - imageName: 'vs2017-win2016' - python.version: '3.9' - Python39Mac: - imageName: 'macos-10.14' - python.version: '3.9' - maxParallel: 4 - pool: - vmImage: $(imageName) + steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: "$(python.version)" + architecture: "x64" - steps: - - task: UsePythonVersion@0 - inputs: - versionSpec: '$(python.version)' - architecture: 'x64' + - script: | + python -m pip install -U pip setuptools + pip install -r requirements.txt + displayName: "Install dependencies" + condition: not(eq(variables['python.version'], '3.5')) - - script: | - python -m pip install -U pip setuptools - pip install -r requirements.txt - displayName: 'Install dependencies' + - script: | + python setup.py build_ext --inplace -j 2 + python setup.py sdist --formats=gztar + displayName: "Compile and build sdist" - - script: | - python setup.py build_ext --inplace -j 2 - python 
setup.py sdist --formats=gztar - displayName: 'Compile and build sdist' + - task: DeleteFiles@1 + inputs: + contents: "spacy" + displayName: "Delete source directory" - - task: DeleteFiles@1 - inputs: - contents: 'spacy' - displayName: 'Delete source directory' + - script: | + pip freeze > installed.txt + pip uninstall -y -r installed.txt + displayName: "Uninstall all packages" - - script: | - pip freeze > installed.txt - pip uninstall -y -r installed.txt - displayName: 'Uninstall all packages' + - bash: | + SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) + pip install dist/$SDIST + displayName: "Install from sdist" + condition: not(eq(variables['python.version'], '3.5')) - - bash: | - SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) - pip install dist/$SDIST - displayName: 'Install from sdist' - - - script: | - pip install -r requirements.txt - python -m pytest --pyargs spacy - displayName: 'Run tests' + - script: | + pip install -r requirements.txt + python -m pytest --pyargs spacy + displayName: "Run tests" diff --git a/spacy/language.py b/spacy/language.py index dd790e85f..7530fa5df 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -968,10 +968,6 @@ class Language: DOCS: https://nightly.spacy.io/api/language#call """ - if len(text) > self.max_length: - raise ValueError( - Errors.E088.format(length=len(text), max_length=self.max_length) - ) doc = self.make_doc(text) if component_cfg is None: component_cfg = {} @@ -1045,6 +1041,10 @@ class Language: text (str): The text to process. RETURNS (Doc): The processed doc. 
""" + if len(text) > self.max_length: + raise ValueError( + Errors.E088.format(length=len(text), max_length=self.max_length) + ) return self.tokenizer(text) def update( diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 2a3b8dd00..c3d983dec 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -261,7 +261,11 @@ class EntityRuler(Pipe): # disable the nlp components after this one in case they hadn't been initialized / deserialised yet try: - current_index = self.nlp.pipe_names.index(self.name) + current_index = -1 + for i, (name, pipe) in enumerate(self.nlp.pipeline): + if self == pipe: + current_index = i + break subsequent_pipes = [ pipe for pipe in self.nlp.pipe_names[current_index + 1 :] ] diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py index b483255c8..14e4bc44b 100644 --- a/spacy/tests/doc/test_retokenize_merge.py +++ b/spacy/tests/doc/test_retokenize_merge.py @@ -416,6 +416,13 @@ def test_doc_retokenizer_merge_lex_attrs(en_vocab): assert doc[1].is_stop assert not doc[0].is_stop assert not doc[1].like_num + # Test that norm is only set on tokens + doc = Doc(en_vocab, words=["eins", "zwei", "!", "!"]) + assert doc[0].norm_ == "eins" + with doc.retokenize() as retokenizer: + retokenizer.merge(doc[0:1], attrs={"norm": "1"}) + assert doc[0].norm_ == "1" + assert en_vocab["eins"].norm_ == "eins" def test_retokenize_skip_duplicates(en_vocab): diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 77b09f376..91f843a93 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -457,6 +457,7 @@ def test_attr_pipeline_checks(en_vocab): ([{"IS_LEFT_PUNCT": True}], "``"), ([{"IS_RIGHT_PUNCT": True}], "''"), ([{"IS_STOP": True}], "the"), + ([{"SPACY": True}], "the"), ([{"LIKE_NUM": True}], "1"), ([{"LIKE_URL": True}], "http://example.com"), 
([{"LIKE_EMAIL": True}], "mail@example.com"), diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index 398dfca26..ea65c464a 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -372,9 +372,10 @@ def _split(Doc doc, int token_index, orths, heads, attrs): # Set attributes on both token and lexeme to take care of token # attribute vs. lexical attribute without having to enumerate # them. If an attribute name is not valid, set_struct_attr will - # ignore it. + # ignore it. Exception: set NORM only on tokens. Token.set_struct_attr(token, attr_name, get_string_id(attr_value)) - Lexeme.set_struct_attr(token.lex, attr_name, get_string_id(attr_value)) + if attr_name != NORM: + Lexeme.set_struct_attr(token.lex, attr_name, get_string_id(attr_value)) # Assign correct dependencies to the inner token for i, head in enumerate(heads): doc.c[token_index + i].head = head diff --git a/spacy/util.py b/spacy/util.py index 60723b85e..981ac426a 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1295,6 +1295,13 @@ def combine_score_weights( class DummyTokenizer: + def __call__(self, text): + raise NotImplementedError + + def pipe(self, texts, **kwargs): + for text in texts: + yield self(text) + # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to # allow serialization (see #1557) def to_bytes(self, **kwargs): diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index 32c6de5f9..22bf4f470 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -169,6 +169,7 @@ rule-based matching are: |  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ | |  `IS_SENT_START` | Token is start of sentence. ~~bool~~ | |  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ | +| `SPACY` | Token has a trailing space. 
~~bool~~ | |  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. Note that the values of these attributes are case-sensitive. For a list of available part-of-speech tags and dependency labels, see the [Annotation Specifications](/api/annotation). ~~str~~ | | `ENT_TYPE` | The token's entity label. ~~str~~ | | `_` 2.1 | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |