Merge branch 'master' into pr/6444

2025-08-08 22:24:55 +03:00 · 2020-12-09 11:09:40 +11:00 · 2020-12-09 11:09:40 +11:00 · 1980203229
commit 1980203229
parent 05a2812ae0 0afb54ac93
8 changed files with 123 additions and 100 deletions
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@ -2,112 +2,113 @@ trigger:
  batch: true
  branches:
    include:
-    - '*'
+      - "*"
    exclude:
-    - 'spacy.io'
+      - "spacy.io"
  paths:
    exclude:
-    - 'website/*'
-    - '*.md'
+      - "website/*"
+      - "*.md"
 pr:
  paths:
    exclude:
-    - 'website/*'
-    - '*.md'
+      - "website/*"
+      - "*.md"

 jobs:
+  # Perform basic checks for the most important errors (syntax etc.). Uses the
+  # config defined in .flake8 and overrides the selected error codes.
+  - job: "Validate"
+    pool:
+      vmImage: "ubuntu-16.04"
+    steps:
+      - task: UsePythonVersion@0
+        inputs:
+          versionSpec: "3.7"
+      - script: |
+          pip install flake8==3.5.0
+          python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
+        displayName: "flake8"

-# Perform basic checks for most important errors (syntax etc.) Uses the config
-# defined in .flake8 and overwrites the selected codes.
- job: 'Validate'
-  pool:
-    vmImage: 'ubuntu-16.04'
-  steps:
-  - task: UsePythonVersion@0
-    inputs:
-      versionSpec: '3.7'
-  - script: |
-      pip install flake8==3.5.0
-      python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
-    displayName: 'flake8'
+  - job: "Test"
+    dependsOn: "Validate"
+    strategy:
+      matrix:
+        Python36Linux:
+          imageName: "ubuntu-16.04"
+          python.version: "3.6"
+        Python36Windows:
+          imageName: "vs2017-win2016"
+          python.version: "3.6"
+        Python36Mac:
+          imageName: "macos-10.14"
+          python.version: "3.6"
+        # Don't test on 3.7 for now to speed up builds
+        # Python37Linux:
+        #   imageName: 'ubuntu-16.04'
+        #   python.version: '3.7'
+        # Python37Windows:
+        #   imageName: 'vs2017-win2016'
+        #   python.version: '3.7'
+        # Python37Mac:
+        #   imageName: 'macos-10.14'
+        #   python.version: '3.7'
+        Python38Linux:
+          imageName: "ubuntu-16.04"
+          python.version: "3.8"
+        Python38Windows:
+          imageName: "vs2017-win2016"
+          python.version: "3.8"
+        Python38Mac:
+          imageName: "macos-10.14"
+          python.version: "3.8"
+        Python39Linux:
+          imageName: "ubuntu-16.04"
+          python.version: "3.9"
+        Python39Windows:
+          imageName: "vs2017-win2016"
+          python.version: "3.9"
+        Python39Mac:
+          imageName: "macos-10.14"
+          python.version: "3.9"
+      maxParallel: 4
+    pool:
+      vmImage: $(imageName)

- job: 'Test'
-  dependsOn: 'Validate'
-  strategy:
-    matrix:
-      Python36Linux:
-        imageName: 'ubuntu-16.04'
-        python.version: '3.6'
-      Python36Windows:
-        imageName: 'vs2017-win2016'
-        python.version: '3.6'
-      Python36Mac:
-        imageName: 'macos-10.14'
-        python.version: '3.6'
-      # Don't test on 3.7 for now to speed up builds
-      # Python37Linux:
-      #   imageName: 'ubuntu-16.04'
-      #   python.version: '3.7'
-      # Python37Windows:
-      #   imageName: 'vs2017-win2016'
-      #   python.version: '3.7'
-      # Python37Mac:
-      #   imageName: 'macos-10.14'
-      #   python.version: '3.7'
-      Python38Linux:
-        imageName: 'ubuntu-16.04'
-        python.version: '3.8'
-      Python38Windows:
-        imageName: 'vs2017-win2016'
-        python.version: '3.8'
-      Python38Mac:
-        imageName: 'macos-10.14'
-        python.version: '3.8'
-      Python39Linux:
-        imageName: 'ubuntu-16.04'
-        python.version: '3.9'
-      Python39Windows:
-        imageName: 'vs2017-win2016'
-        python.version: '3.9'
-      Python39Mac:
-        imageName: 'macos-10.14'
-        python.version: '3.9'
-    maxParallel: 4
-  pool:
-    vmImage: $(imageName)
+    steps:
+      - task: UsePythonVersion@0
+        inputs:
+          versionSpec: "$(python.version)"
+          architecture: "x64"

-  steps:
-  - task: UsePythonVersion@0
-    inputs:
-      versionSpec: '$(python.version)'
-      architecture: 'x64'
+      - script: |
+          python -m pip install -U pip setuptools
+          pip install -r requirements.txt
+        displayName: "Install dependencies"
+        condition: not(eq(variables['python.version'], '3.5'))

-  - script: |
-      python -m pip install -U pip setuptools
-      pip install -r requirements.txt
-    displayName: 'Install dependencies'
+      - script: |
+          python setup.py build_ext --inplace -j 2
+          python setup.py sdist --formats=gztar
+        displayName: "Compile and build sdist"

-  - script: |
-      python setup.py build_ext --inplace -j 2
-      python setup.py sdist --formats=gztar
-    displayName: 'Compile and build sdist'
+      - task: DeleteFiles@1
+        inputs:
+          contents: "spacy"
+        displayName: "Delete source directory"

-  - task: DeleteFiles@1
-    inputs:
-      contents: 'spacy'
-    displayName: 'Delete source directory'
+      - script: |
+          pip freeze > installed.txt
+          pip uninstall -y -r installed.txt
+        displayName: "Uninstall all packages"

-  - script: |
-      pip freeze > installed.txt
-      pip uninstall -y -r installed.txt
-    displayName: 'Uninstall all packages'
+      - bash: |
+          SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
+          pip install dist/$SDIST
+        displayName: "Install from sdist"
+        condition: not(eq(variables['python.version'], '3.5'))

-  - bash: |
-      SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
-      pip install dist/$SDIST
-    displayName: 'Install from sdist'
-
-  - script: |
-      pip install -r requirements.txt
-      python -m pytest --pyargs spacy
-    displayName: 'Run tests'
+      - script: |
+          pip install -r requirements.txt
+          python -m pytest --pyargs spacy
+        displayName: "Run tests"
--- a/spacy/language.py
+++ b/spacy/language.py
@ -968,10 +968,6 @@ class Language:

        DOCS: https://nightly.spacy.io/api/language#call
        """
-        if len(text) > self.max_length:
-            raise ValueError(
-                Errors.E088.format(length=len(text), max_length=self.max_length)
-            )
        doc = self.make_doc(text)
        if component_cfg is None:
            component_cfg = {}
@ -1045,6 +1041,11 @@ class Language:
        text (str): The text to process.
        RETURNS (Doc): The processed doc.
        """
+        if len(text) > self.max_length:
+            raise ValueError(
+                Errors.E088.format(length=len(text), max_length=self.max_length)
+            )
+        return self.tokenizer(text)
        return self.tokenizer(text)

    def update(
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@ -261,7 +261,11 @@ class EntityRuler(Pipe):

        # disable the nlp components after this one in case they hadn't been initialized / deserialised yet
        try:
-            current_index = self.nlp.pipe_names.index(self.name)
+            current_index = -1
+            for i, (name, pipe) in enumerate(self.nlp.pipeline):
+                if self == pipe:
+                    current_index = i
+                    break
            subsequent_pipes = [
                pipe for pipe in self.nlp.pipe_names[current_index + 1 :]
            ]
--- a/spacy/tests/doc/test_retokenize_merge.py
+++ b/spacy/tests/doc/test_retokenize_merge.py
@ -416,6 +416,13 @@ def test_doc_retokenizer_merge_lex_attrs(en_vocab):
    assert doc[1].is_stop
    assert not doc[0].is_stop
    assert not doc[1].like_num
+    # Test that norm is set only on the token, not on the vocab lexeme
+    doc = Doc(en_vocab, words=["eins", "zwei", "!", "!"])
+    assert doc[0].norm_ == "eins"
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[0:1], attrs={"norm": "1"})
+    assert doc[0].norm_ == "1"
+    assert en_vocab["eins"].norm_ == "eins"


 def test_retokenize_skip_duplicates(en_vocab):
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@ -457,6 +457,7 @@ def test_attr_pipeline_checks(en_vocab):
        ([{"IS_LEFT_PUNCT": True}], "``"),
        ([{"IS_RIGHT_PUNCT": True}], "''"),
        ([{"IS_STOP": True}], "the"),
+        ([{"SPACY": True}], "the"),
        ([{"LIKE_NUM": True}], "1"),
        ([{"LIKE_URL": True}], "http://example.com"),
        ([{"LIKE_EMAIL": True}], "mail@example.com"),
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@ -372,9 +372,10 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
                # Set attributes on both token and lexeme to take care of token
                # attribute vs. lexical attribute without having to enumerate
                # them. If an attribute name is not valid, set_struct_attr will
-                # ignore it.
+                # ignore it. Exception: set NORM only on tokens.
                Token.set_struct_attr(token, attr_name, get_string_id(attr_value))
-                Lexeme.set_struct_attr(<LexemeC*>token.lex, attr_name, get_string_id(attr_value))
+                if attr_name != NORM:
+                    Lexeme.set_struct_attr(<LexemeC*>token.lex, attr_name, get_string_id(attr_value))
    # Assign correct dependencies to the inner token
    for i, head in enumerate(heads):
        doc.c[token_index + i].head = head
--- a/spacy/util.py
+++ b/spacy/util.py
@ -1295,6 +1295,13 @@ def combine_score_weights(


 class DummyTokenizer:
+    def __call__(self, text):
+        raise NotImplementedError
+
+    def pipe(self, texts, **kwargs):
+        for text in texts:
+            yield self(text)
+
    # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
    # allow serialization (see #1557)
    def to_bytes(self, **kwargs):
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@ -169,6 +169,7 @@ rule-based matching are:
 |  `IS_PUNCT`, `IS_SPACE`, `IS_STOP`              | Token is punctuation, whitespace, stop word. ~~bool~~                                                                                                                                                                                                                                                     |
 |  `IS_SENT_START`                                | Token is start of sentence. ~~bool~~                                                                                                                                                                                                                                                                      |
 |  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`           | Token text resembles a number, URL, email. ~~bool~~                                                                                                                                                                                                                                                       |
+| `SPACY`                                         | Token has a trailing space. ~~bool~~                                                                                                                                                                                                                                                                      |
 |  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. Note that the values of these attributes are case-sensitive. For a list of available part-of-speech tags and dependency labels, see the [Annotation Specifications](/api/annotation). ~~str~~ |
 | `ENT_TYPE`                                      | The token's entity label. ~~str~~                                                                                                                                                                                                                                                                         |
 | `_` <Tag variant="new">2.1</Tag>                | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~                                                                                                                                                                                 |