Merge branch 'master' into pr/6444

2025-07-11 16:52:21 +03:00 · 2020-12-09 11:09:40 +11:00 · 2020-12-09 11:09:40 +11:00 · 1980203229
commit 1980203229
parent 05a2812ae0 0afb54ac93
8 changed files with 123 additions and 100 deletions
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@ -2,48 +2,47 @@ trigger:
  batch: true
  branches:
    include:
-    - '*'
+      - "*"
    exclude:
-    - 'spacy.io'
+      - "spacy.io"
  paths:
    exclude:
-    - 'website/*'
+      - "website/*"
-    - '*.md'
+      - "*.md"
 pr:
  paths:
    exclude:
-    - 'website/*'
+      - "website/*"
-    - '*.md'
+      - "*.md"
 jobs:
-
+  # Perform basic checks for most important errors (syntax etc.) Uses the config
-# Perform basic checks for most important errors (syntax etc.) Uses the config
+  # defined in .flake8 and overwrites the selected codes.
-# defined in .flake8 and overwrites the selected codes.
+  - job: "Validate"
 - job: 'Validate'
    pool:
-    vmImage: 'ubuntu-16.04'
+      vmImage: "ubuntu-16.04"
    steps:
      - task: UsePythonVersion@0
        inputs:
-      versionSpec: '3.7'
+          versionSpec: "3.7"
      - script: |
          pip install flake8==3.5.0
          python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
-    displayName: 'flake8'
+        displayName: "flake8"
- job: 'Test'
+  - job: "Test"
-  dependsOn: 'Validate'
+    dependsOn: "Validate"
    strategy:
      matrix:
        Python36Linux:
-        imageName: 'ubuntu-16.04'
+          imageName: "ubuntu-16.04"
-        python.version: '3.6'
+          python.version: "3.6"
        Python36Windows:
-        imageName: 'vs2017-win2016'
+          imageName: "vs2017-win2016"
-        python.version: '3.6'
+          python.version: "3.6"
        Python36Mac:
-        imageName: 'macos-10.14'
+          imageName: "macos-10.14"
-        python.version: '3.6'
+          python.version: "3.6"
        # Don't test on 3.7 for now to speed up builds
        # Python37Linux:
        #   imageName: 'ubuntu-16.04'
@ -55,23 +54,23 @@ jobs:
        #   imageName: 'macos-10.14'
        #   python.version: '3.7'
        Python38Linux:
-        imageName: 'ubuntu-16.04'
+          imageName: "ubuntu-16.04"
-        python.version: '3.8'
+          python.version: "3.8"
        Python38Windows:
-        imageName: 'vs2017-win2016'
+          imageName: "vs2017-win2016"
-        python.version: '3.8'
+          python.version: "3.8"
        Python38Mac:
-        imageName: 'macos-10.14'
+          imageName: "macos-10.14"
-        python.version: '3.8'
+          python.version: "3.8"
        Python39Linux:
-        imageName: 'ubuntu-16.04'
+          imageName: "ubuntu-16.04"
-        python.version: '3.9'
+          python.version: "3.9"
        Python39Windows:
-        imageName: 'vs2017-win2016'
+          imageName: "vs2017-win2016"
-        python.version: '3.9'
+          python.version: "3.9"
        Python39Mac:
-        imageName: 'macos-10.14'
+          imageName: "macos-10.14"
-        python.version: '3.9'
+          python.version: "3.9"
      maxParallel: 4
    pool:
      vmImage: $(imageName)
@ -79,35 +78,37 @@ jobs:
    steps:
      - task: UsePythonVersion@0
        inputs:
-      versionSpec: '$(python.version)'
+          versionSpec: "$(python.version)"
-      architecture: 'x64'
+          architecture: "x64"
      - script: |
          python -m pip install -U pip setuptools
          pip install -r requirements.txt
-    displayName: 'Install dependencies'
+        displayName: "Install dependencies"
        condition: not(eq(variables['python.version'], '3.5'))
      - script: |
          python setup.py build_ext --inplace -j 2
          python setup.py sdist --formats=gztar
-    displayName: 'Compile and build sdist'
+        displayName: "Compile and build sdist"
      - task: DeleteFiles@1
        inputs:
-      contents: 'spacy'
+          contents: "spacy"
-    displayName: 'Delete source directory'
+        displayName: "Delete source directory"
      - script: |
          pip freeze > installed.txt
          pip uninstall -y -r installed.txt
-    displayName: 'Uninstall all packages'
+        displayName: "Uninstall all packages"
      - bash: |
          SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
          pip install dist/$SDIST
-    displayName: 'Install from sdist'
+        displayName: "Install from sdist"
        condition: not(eq(variables['python.version'], '3.5'))
      - script: |
          pip install -r requirements.txt
          python -m pytest --pyargs spacy
-    displayName: 'Run tests'
+        displayName: "Run tests"
--- a/spacy/language.py
+++ b/spacy/language.py
@ -968,10 +968,6 @@ class Language:
        DOCS: https://nightly.spacy.io/api/language#call
        """
        if len(text) > self.max_length:
            raise ValueError(
                Errors.E088.format(length=len(text), max_length=self.max_length)
            )
        doc = self.make_doc(text)
        if component_cfg is None:
            component_cfg = {}
@ -1045,6 +1041,11 @@ class Language:
        text (str): The text to process.
        RETURNS (Doc): The processed doc.
        """
        if len(text) > self.max_length:
            raise ValueError(
                Errors.E088.format(length=len(text), max_length=self.max_length)
            )
        return self.tokenizer(text)
        return self.tokenizer(text)
    def update(
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@ -261,7 +261,11 @@ class EntityRuler(Pipe):
        # disable the nlp components after this one in case they haven't been initialized / deserialised yet
        try:
-            current_index = self.nlp.pipe_names.index(self.name)
+            current_index = -1
            for i, (name, pipe) in enumerate(self.nlp.pipeline):
                if self == pipe:
                    current_index = i
                    break
            subsequent_pipes = [
                pipe for pipe in self.nlp.pipe_names[current_index + 1 :]
            ]
--- a/spacy/tests/doc/test_retokenize_merge.py
+++ b/spacy/tests/doc/test_retokenize_merge.py
@ -416,6 +416,13 @@ def test_doc_retokenizer_merge_lex_attrs(en_vocab):
    assert doc[1].is_stop
    assert not doc[0].is_stop
    assert not doc[1].like_num
    # Test that norm is only set on tokens
    doc = Doc(en_vocab, words=["eins", "zwei", "!", "!"])
    assert doc[0].norm_ == "eins"
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:1], attrs={"norm": "1"})
    assert doc[0].norm_ == "1"
    assert en_vocab["eins"].norm_ == "eins"
 def test_retokenize_skip_duplicates(en_vocab):
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@ -457,6 +457,7 @@ def test_attr_pipeline_checks(en_vocab):
        ([{"IS_LEFT_PUNCT": True}], "``"),
        ([{"IS_RIGHT_PUNCT": True}], "''"),
        ([{"IS_STOP": True}], "the"),
        ([{"SPACY": True}], "the"),
        ([{"LIKE_NUM": True}], "1"),
        ([{"LIKE_URL": True}], "http://example.com"),
        ([{"LIKE_EMAIL": True}], "mail@example.com"),
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@ -372,8 +372,9 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
                # Set attributes on both token and lexeme to take care of token
                # attribute vs. lexical attribute without having to enumerate
                # them. If an attribute name is not valid, set_struct_attr will
-                # ignore it.
+                # ignore it. Exception: set NORM only on tokens.
                Token.set_struct_attr(token, attr_name, get_string_id(attr_value))
                if attr_name != NORM:
                    Lexeme.set_struct_attr(<LexemeC*>token.lex, attr_name, get_string_id(attr_value))
    # Assign correct dependencies to the inner token
    for i, head in enumerate(heads):
--- a/spacy/util.py
+++ b/spacy/util.py
@ -1295,6 +1295,13 @@ def combine_score_weights(
 class DummyTokenizer:
    def __call__(self, text):
        """Placeholder call hook that unconditionally raises.

        text: The input to process; not inspected by this implementation.
        RAISES (NotImplementedError): Always.
        """
        raise NotImplementedError
    def pipe(self, texts, **kwargs):
        """Lazily apply this object to every item of an iterable.

        texts (Iterable): The items to process; each one is passed to
            `self(...)` in iteration order.
        **kwargs: Accepted but not used by this implementation.
        YIELDS: The result of calling `self` on each item, one at a time.
        """
        # map() is lazy, so items are processed only as the caller consumes
        # the generator, the same as an explicit for/yield loop.
        yield from map(self, texts)
    # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
    # allow serialization (see #1557)
    def to_bytes(self, **kwargs):
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@ -169,6 +169,7 @@ rule-based matching are:
 |  `IS_PUNCT`, `IS_SPACE`, `IS_STOP`              | Token is punctuation, whitespace, stop word. ~~bool~~                                                                                                                                                                                                                                                     |
 |  `IS_SENT_START`                                | Token is start of sentence. ~~bool~~                                                                                                                                                                                                                                                                      |
 |  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`           | Token text resembles a number, URL, email. ~~bool~~                                                                                                                                                                                                                                                       |
 | `SPACY`                                         | Token has a trailing space. ~~bool~~                                                                                                                                                                                                                                                                      |
 |  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. Note that the values of these attributes are case-sensitive. For a list of available part-of-speech tags and dependency labels, see the [Annotation Specifications](/api/annotation). ~~str~~ |
 | `ENT_TYPE`                                      | The token's entity label. ~~str~~                                                                                                                                                                                                                                                                         |
 | `_` <Tag variant="new">2.1</Tag>                | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~                                                                                                                                                                                 |