Mirror of https://github.com/explosion/spaCy.git (synced 2025-07-11 08:42:28 +03:00)

Merge branch 'master' into pr/6444

Commit 1980203229

@@ -2,112 +2,113 @@ trigger:
   batch: true
   branches:
     include:
-      - '*'
+      - "*"
     exclude:
-      - 'spacy.io'
+      - "spacy.io"
   paths:
     exclude:
-      - 'website/*'
-      - '*.md'
+      - "website/*"
+      - "*.md"
 pr:
   paths:
     exclude:
-      - 'website/*'
-      - '*.md'
+      - "website/*"
+      - "*.md"
 
 jobs:
   # Perform basic checks for most important errors (syntax etc.) Uses the config
   # defined in .flake8 and overwrites the selected codes.
-  - job: 'Validate'
+  - job: "Validate"
     pool:
-      vmImage: 'ubuntu-16.04'
+      vmImage: "ubuntu-16.04"
     steps:
       - task: UsePythonVersion@0
         inputs:
-          versionSpec: '3.7'
+          versionSpec: "3.7"
       - script: |
           pip install flake8==3.5.0
           python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
-        displayName: 'flake8'
+        displayName: "flake8"
 
-  - job: 'Test'
-    dependsOn: 'Validate'
+  - job: "Test"
+    dependsOn: "Validate"
     strategy:
       matrix:
         Python36Linux:
-          imageName: 'ubuntu-16.04'
-          python.version: '3.6'
+          imageName: "ubuntu-16.04"
+          python.version: "3.6"
         Python36Windows:
-          imageName: 'vs2017-win2016'
-          python.version: '3.6'
+          imageName: "vs2017-win2016"
+          python.version: "3.6"
         Python36Mac:
-          imageName: 'macos-10.14'
-          python.version: '3.6'
+          imageName: "macos-10.14"
+          python.version: "3.6"
         # Don't test on 3.7 for now to speed up builds
         # Python37Linux:
         #   imageName: 'ubuntu-16.04'
         #   python.version: '3.7'
         # Python37Windows:
         #   imageName: 'vs2017-win2016'
         #   python.version: '3.7'
         # Python37Mac:
         #   imageName: 'macos-10.14'
         #   python.version: '3.7'
         Python38Linux:
-          imageName: 'ubuntu-16.04'
-          python.version: '3.8'
+          imageName: "ubuntu-16.04"
+          python.version: "3.8"
         Python38Windows:
-          imageName: 'vs2017-win2016'
-          python.version: '3.8'
+          imageName: "vs2017-win2016"
+          python.version: "3.8"
         Python38Mac:
-          imageName: 'macos-10.14'
-          python.version: '3.8'
+          imageName: "macos-10.14"
+          python.version: "3.8"
         Python39Linux:
-          imageName: 'ubuntu-16.04'
-          python.version: '3.9'
+          imageName: "ubuntu-16.04"
+          python.version: "3.9"
         Python39Windows:
-          imageName: 'vs2017-win2016'
-          python.version: '3.9'
+          imageName: "vs2017-win2016"
+          python.version: "3.9"
         Python39Mac:
-          imageName: 'macos-10.14'
-          python.version: '3.9'
+          imageName: "macos-10.14"
+          python.version: "3.9"
       maxParallel: 4
     pool:
      vmImage: $(imageName)
 
     steps:
       - task: UsePythonVersion@0
         inputs:
-          versionSpec: '$(python.version)'
-          architecture: 'x64'
+          versionSpec: "$(python.version)"
+          architecture: "x64"
 
       - script: |
           python -m pip install -U pip setuptools
           pip install -r requirements.txt
-        displayName: 'Install dependencies'
+        displayName: "Install dependencies"
+        condition: not(eq(variables['python.version'], '3.5'))
 
       - script: |
           python setup.py build_ext --inplace -j 2
           python setup.py sdist --formats=gztar
-        displayName: 'Compile and build sdist'
+        displayName: "Compile and build sdist"
 
       - task: DeleteFiles@1
         inputs:
-          contents: 'spacy'
-        displayName: 'Delete source directory'
+          contents: "spacy"
+        displayName: "Delete source directory"
 
       - script: |
           pip freeze > installed.txt
           pip uninstall -y -r installed.txt
-        displayName: 'Uninstall all packages'
+        displayName: "Uninstall all packages"
 
       - bash: |
           SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
           pip install dist/$SDIST
-        displayName: 'Install from sdist'
+        displayName: "Install from sdist"
+        condition: not(eq(variables['python.version'], '3.5'))
 
       - script: |
           pip install -r requirements.txt
           python -m pytest --pyargs spacy
-        displayName: 'Run tests'
+        displayName: "Run tests"

@@ -968,10 +968,6 @@ class Language:

         DOCS: https://nightly.spacy.io/api/language#call
         """
-        if len(text) > self.max_length:
-            raise ValueError(
-                Errors.E088.format(length=len(text), max_length=self.max_length)
-            )
         doc = self.make_doc(text)
         if component_cfg is None:
             component_cfg = {}

@@ -1045,6 +1041,11 @@ class Language:

         text (str): The text to process.
         RETURNS (Doc): The processed doc.
         """
+        if len(text) > self.max_length:
+            raise ValueError(
+                Errors.E088.format(length=len(text), max_length=self.max_length)
+            )
         return self.tokenizer(text)

     def update(

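Taken together, the two Language hunks above move the max_length guard out of __call__ and into make_doc, so the limit is enforced anywhere a Doc is created from raw text (including nlp.pipe, which tokenizes via make_doc), not only on direct calls. A minimal sketch of the resulting behavior; the tiny limit is our own illustration, not a spaCy default:

    import spacy

    nlp = spacy.blank("en")
    nlp.max_length = 20  # artificially low, for illustration only

    # The guard now lives in make_doc, so both nlp(text) and nlp.make_doc(text)
    # raise the E088 ValueError for oversized input:
    try:
        nlp.make_doc("a" * 100)
    except ValueError as err:
        print(err)  # [E088] Text of length 100 exceeds maximum of 20 ...
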
@@ -261,7 +261,11 @@ class EntityRuler(Pipe):

         # disable the nlp components after this one in case they hadn't been initialized / deserialised yet
         try:
-            current_index = self.nlp.pipe_names.index(self.name)
+            current_index = -1
+            for i, (name, pipe) in enumerate(self.nlp.pipeline):
+                if self == pipe:
+                    current_index = i
+                    break
             subsequent_pipes = [
                 pipe for pipe in self.nlp.pipe_names[current_index + 1 :]
             ]

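The EntityRuler hunk replaces the name-based lookup (pipe_names.index(self.name)) with a scan over nlp.pipeline that compares the components themselves, so the ruler can find its own position even when its name attribute disagrees with the name it was registered under. A sketch of the new lookup in isolation; the custom name and the sentencizer are our own setup:

    import spacy

    nlp = spacy.blank("en")
    ruler = nlp.add_pipe("entity_ruler", name="my_rules")  # custom name
    nlp.add_pipe("sentencizer")

    # Locate the component by identity rather than by name, falling back to
    # -1 if it is not (yet) part of the pipeline:
    current_index = -1
    for i, (name, pipe) in enumerate(nlp.pipeline):
        if pipe is ruler:
            current_index = i
            break
    print(nlp.pipe_names[current_index + 1 :])  # ['sentencizer']
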
@@ -416,6 +416,13 @@ def test_doc_retokenizer_merge_lex_attrs(en_vocab):
     assert doc[1].is_stop
     assert not doc[0].is_stop
     assert not doc[1].like_num
+    # Test that norm is only set on tokens
+    doc = Doc(en_vocab, words=["eins", "zwei", "!", "!"])
+    assert doc[0].norm_ == "eins"
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[0:1], attrs={"norm": "1"})
+    assert doc[0].norm_ == "1"
+    assert en_vocab["eins"].norm_ == "eins"


 def test_retokenize_skip_duplicates(en_vocab):

@@ -457,6 +457,7 @@ def test_attr_pipeline_checks(en_vocab):
         ([{"IS_LEFT_PUNCT": True}], "``"),
         ([{"IS_RIGHT_PUNCT": True}], "''"),
         ([{"IS_STOP": True}], "the"),
+        ([{"SPACY": True}], "the"),
         ([{"LIKE_NUM": True}], "1"),
         ([{"LIKE_URL": True}], "http://example.com"),
         ([{"LIKE_EMAIL": True}], "mail@example.com"),

@@ -372,9 +372,10 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
             # Set attributes on both token and lexeme to take care of token
             # attribute vs. lexical attribute without having to enumerate
             # them. If an attribute name is not valid, set_struct_attr will
-            # ignore it.
+            # ignore it. Exception: set NORM only on tokens.
             Token.set_struct_attr(token, attr_name, get_string_id(attr_value))
-            Lexeme.set_struct_attr(<LexemeC*>token.lex, attr_name, get_string_id(attr_value))
+            if attr_name != NORM:
+                Lexeme.set_struct_attr(<LexemeC*>token.lex, attr_name, get_string_id(attr_value))
     # Assign correct dependencies to the inner token
     for i, head in enumerate(heads):
         doc.c[token_index + i].head = head

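The _split hunk applies the same rule the new retokenizer test exercises: NORM is written to the token only, because a norm such as "1" for "eins" describes one occurrence, not the vocabulary entry shared by every "eins". The merge case from the test, as a standalone sketch:

    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    vocab = Vocab()
    doc = Doc(vocab, words=["eins", "zwei", "!", "!"])
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:1], attrs={"norm": "1"})

    print(doc[0].norm_)         # "1"    -- token-level norm was set
    print(vocab["eins"].norm_)  # "eins" -- the shared lexeme is untouched
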
@@ -1295,6 +1295,13 @@ def combine_score_weights(


 class DummyTokenizer:
+    def __call__(self, text):
+        raise NotImplementedError
+
+    def pipe(self, texts, **kwargs):
+        for text in texts:
+            yield self(text)
+
     # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
     # allow serialization (see #1557)
     def to_bytes(self, **kwargs):

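With __call__ and pipe added, DummyTokenizer now outlines the whole custom-tokenizer interface: override __call__, inherit pipe and the serialization stubs. A minimal subclass along those lines; the whitespace splitting is our own example, and we assume DummyTokenizer is importable from spacy.util, where combine_score_weights also lives per the hunk header:

    from spacy.tokens import Doc
    from spacy.util import DummyTokenizer
    from spacy.vocab import Vocab

    class WhitespaceTokenizer(DummyTokenizer):
        def __init__(self, vocab):
            self.vocab = vocab

        # Only __call__ is required; pipe() and the to_bytes/from_bytes/
        # to_disk/from_disk stubs come from DummyTokenizer.
        def __call__(self, text):
            return Doc(self.vocab, words=text.split(" "))

    tokenizer = WhitespaceTokenizer(Vocab())
    print([t.text for t in tokenizer("hello world !")])        # ['hello', 'world', '!']
    print([len(doc) for doc in tokenizer.pipe(["a b", "c"])])  # [2, 1]
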
@@ -169,6 +169,7 @@ rule-based matching are:

 | `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ |
 | `IS_SENT_START` | Token is start of sentence. ~~bool~~ |
 | `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ |
+| `SPACY` | Token has a trailing space. ~~bool~~ |
 | `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. Note that the values of these attributes are case-sensitive. For a list of available part-of-speech tags and dependency labels, see the [Annotation Specifications](/api/annotation). ~~str~~ |
 | `ENT_TYPE` | The token's entity label. ~~str~~ |
 | `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |

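The new `SPACY` row documents the attribute the matcher test above exercises: it matches tokens that carry a trailing space (token.whitespace_). A small usage sketch; the pattern name and example text are our own:

    import spacy
    from spacy.matcher import Matcher

    nlp = spacy.blank("en")
    matcher = Matcher(nlp.vocab)
    matcher.add("HAS_TRAILING_SPACE", [[{"SPACY": True}]])

    doc = nlp("hello world")
    print([doc[start:end].text for _, start, end in matcher(doc)])  # ['hello']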