mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Merge branch 'master' into pr/6444
This commit is contained in:
commit
1980203229
|
@ -2,112 +2,113 @@ trigger:
|
|||
batch: true
|
||||
branches:
|
||||
include:
|
||||
- '*'
|
||||
- "*"
|
||||
exclude:
|
||||
- 'spacy.io'
|
||||
- "spacy.io"
|
||||
paths:
|
||||
exclude:
|
||||
- 'website/*'
|
||||
- '*.md'
|
||||
- "website/*"
|
||||
- "*.md"
|
||||
pr:
|
||||
paths:
|
||||
exclude:
|
||||
- 'website/*'
|
||||
- '*.md'
|
||||
- "website/*"
|
||||
- "*.md"
|
||||
|
||||
jobs:
|
||||
# Perform basic checks for the most important errors (syntax etc.). Uses the config
|
||||
# defined in .flake8 and overrides the selected codes.
|
||||
- job: "Validate"
|
||||
pool:
|
||||
vmImage: "ubuntu-16.04"
|
||||
steps:
|
||||
- task: UsePythonVersion@0
|
||||
inputs:
|
||||
versionSpec: "3.7"
|
||||
- script: |
|
||||
pip install flake8==3.5.0
|
||||
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
|
||||
displayName: "flake8"
|
||||
|
||||
# Perform basic checks for the most important errors (syntax etc.). Uses the config
|
||||
# defined in .flake8 and overrides the selected codes.
|
||||
- job: 'Validate'
|
||||
pool:
|
||||
vmImage: 'ubuntu-16.04'
|
||||
steps:
|
||||
- task: UsePythonVersion@0
|
||||
inputs:
|
||||
versionSpec: '3.7'
|
||||
- script: |
|
||||
pip install flake8==3.5.0
|
||||
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
|
||||
displayName: 'flake8'
|
||||
- job: "Test"
|
||||
dependsOn: "Validate"
|
||||
strategy:
|
||||
matrix:
|
||||
Python36Linux:
|
||||
imageName: "ubuntu-16.04"
|
||||
python.version: "3.6"
|
||||
Python36Windows:
|
||||
imageName: "vs2017-win2016"
|
||||
python.version: "3.6"
|
||||
Python36Mac:
|
||||
imageName: "macos-10.14"
|
||||
python.version: "3.6"
|
||||
# Don't test on 3.7 for now to speed up builds
|
||||
# Python37Linux:
|
||||
# imageName: 'ubuntu-16.04'
|
||||
# python.version: '3.7'
|
||||
# Python37Windows:
|
||||
# imageName: 'vs2017-win2016'
|
||||
# python.version: '3.7'
|
||||
# Python37Mac:
|
||||
# imageName: 'macos-10.14'
|
||||
# python.version: '3.7'
|
||||
Python38Linux:
|
||||
imageName: "ubuntu-16.04"
|
||||
python.version: "3.8"
|
||||
Python38Windows:
|
||||
imageName: "vs2017-win2016"
|
||||
python.version: "3.8"
|
||||
Python38Mac:
|
||||
imageName: "macos-10.14"
|
||||
python.version: "3.8"
|
||||
Python39Linux:
|
||||
imageName: "ubuntu-16.04"
|
||||
python.version: "3.9"
|
||||
Python39Windows:
|
||||
imageName: "vs2017-win2016"
|
||||
python.version: "3.9"
|
||||
Python39Mac:
|
||||
imageName: "macos-10.14"
|
||||
python.version: "3.9"
|
||||
maxParallel: 4
|
||||
pool:
|
||||
vmImage: $(imageName)
|
||||
|
||||
- job: 'Test'
|
||||
dependsOn: 'Validate'
|
||||
strategy:
|
||||
matrix:
|
||||
Python36Linux:
|
||||
imageName: 'ubuntu-16.04'
|
||||
python.version: '3.6'
|
||||
Python36Windows:
|
||||
imageName: 'vs2017-win2016'
|
||||
python.version: '3.6'
|
||||
Python36Mac:
|
||||
imageName: 'macos-10.14'
|
||||
python.version: '3.6'
|
||||
# Don't test on 3.7 for now to speed up builds
|
||||
# Python37Linux:
|
||||
# imageName: 'ubuntu-16.04'
|
||||
# python.version: '3.7'
|
||||
# Python37Windows:
|
||||
# imageName: 'vs2017-win2016'
|
||||
# python.version: '3.7'
|
||||
# Python37Mac:
|
||||
# imageName: 'macos-10.14'
|
||||
# python.version: '3.7'
|
||||
Python38Linux:
|
||||
imageName: 'ubuntu-16.04'
|
||||
python.version: '3.8'
|
||||
Python38Windows:
|
||||
imageName: 'vs2017-win2016'
|
||||
python.version: '3.8'
|
||||
Python38Mac:
|
||||
imageName: 'macos-10.14'
|
||||
python.version: '3.8'
|
||||
Python39Linux:
|
||||
imageName: 'ubuntu-16.04'
|
||||
python.version: '3.9'
|
||||
Python39Windows:
|
||||
imageName: 'vs2017-win2016'
|
||||
python.version: '3.9'
|
||||
Python39Mac:
|
||||
imageName: 'macos-10.14'
|
||||
python.version: '3.9'
|
||||
maxParallel: 4
|
||||
pool:
|
||||
vmImage: $(imageName)
|
||||
steps:
|
||||
- task: UsePythonVersion@0
|
||||
inputs:
|
||||
versionSpec: "$(python.version)"
|
||||
architecture: "x64"
|
||||
|
||||
steps:
|
||||
- task: UsePythonVersion@0
|
||||
inputs:
|
||||
versionSpec: '$(python.version)'
|
||||
architecture: 'x64'
|
||||
- script: |
|
||||
python -m pip install -U pip setuptools
|
||||
pip install -r requirements.txt
|
||||
displayName: "Install dependencies"
|
||||
condition: not(eq(variables['python.version'], '3.5'))
|
||||
|
||||
- script: |
|
||||
python -m pip install -U pip setuptools
|
||||
pip install -r requirements.txt
|
||||
displayName: 'Install dependencies'
|
||||
- script: |
|
||||
python setup.py build_ext --inplace -j 2
|
||||
python setup.py sdist --formats=gztar
|
||||
displayName: "Compile and build sdist"
|
||||
|
||||
- script: |
|
||||
python setup.py build_ext --inplace -j 2
|
||||
python setup.py sdist --formats=gztar
|
||||
displayName: 'Compile and build sdist'
|
||||
- task: DeleteFiles@1
|
||||
inputs:
|
||||
contents: "spacy"
|
||||
displayName: "Delete source directory"
|
||||
|
||||
- task: DeleteFiles@1
|
||||
inputs:
|
||||
contents: 'spacy'
|
||||
displayName: 'Delete source directory'
|
||||
- script: |
|
||||
pip freeze > installed.txt
|
||||
pip uninstall -y -r installed.txt
|
||||
displayName: "Uninstall all packages"
|
||||
|
||||
- script: |
|
||||
pip freeze > installed.txt
|
||||
pip uninstall -y -r installed.txt
|
||||
displayName: 'Uninstall all packages'
|
||||
- bash: |
|
||||
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
|
||||
pip install dist/$SDIST
|
||||
displayName: "Install from sdist"
|
||||
condition: not(eq(variables['python.version'], '3.5'))
|
||||
|
||||
- bash: |
|
||||
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
|
||||
pip install dist/$SDIST
|
||||
displayName: 'Install from sdist'
|
||||
|
||||
- script: |
|
||||
pip install -r requirements.txt
|
||||
python -m pytest --pyargs spacy
|
||||
displayName: 'Run tests'
|
||||
- script: |
|
||||
pip install -r requirements.txt
|
||||
python -m pytest --pyargs spacy
|
||||
displayName: "Run tests"
|
||||
|
|
|
@ -968,10 +968,6 @@ class Language:
|
|||
|
||||
DOCS: https://nightly.spacy.io/api/language#call
|
||||
"""
|
||||
if len(text) > self.max_length:
|
||||
raise ValueError(
|
||||
Errors.E088.format(length=len(text), max_length=self.max_length)
|
||||
)
|
||||
doc = self.make_doc(text)
|
||||
if component_cfg is None:
|
||||
component_cfg = {}
|
||||
|
@ -1045,6 +1041,11 @@ class Language:
|
|||
text (str): The text to process.
|
||||
RETURNS (Doc): The processed doc.
|
||||
"""
|
||||
if len(text) > self.max_length:
|
||||
raise ValueError(
|
||||
Errors.E088.format(length=len(text), max_length=self.max_length)
|
||||
)
|
||||
return self.tokenizer(text)
|
||||
return self.tokenizer(text)
|
||||
|
||||
def update(
|
||||
|
|
|
@ -261,7 +261,11 @@ class EntityRuler(Pipe):
|
|||
|
||||
# disable the nlp components after this one in case they hadn't been initialized / deserialised yet
|
||||
try:
|
||||
current_index = self.nlp.pipe_names.index(self.name)
|
||||
current_index = -1
|
||||
for i, (name, pipe) in enumerate(self.nlp.pipeline):
|
||||
if self == pipe:
|
||||
current_index = i
|
||||
break
|
||||
subsequent_pipes = [
|
||||
pipe for pipe in self.nlp.pipe_names[current_index + 1 :]
|
||||
]
|
||||
|
|
|
@ -416,6 +416,13 @@ def test_doc_retokenizer_merge_lex_attrs(en_vocab):
|
|||
assert doc[1].is_stop
|
||||
assert not doc[0].is_stop
|
||||
assert not doc[1].like_num
|
||||
# Test that norm is only set on tokens
|
||||
doc = Doc(en_vocab, words=["eins", "zwei", "!", "!"])
|
||||
assert doc[0].norm_ == "eins"
|
||||
with doc.retokenize() as retokenizer:
|
||||
retokenizer.merge(doc[0:1], attrs={"norm": "1"})
|
||||
assert doc[0].norm_ == "1"
|
||||
assert en_vocab["eins"].norm_ == "eins"
|
||||
|
||||
|
||||
def test_retokenize_skip_duplicates(en_vocab):
|
||||
|
|
|
@ -457,6 +457,7 @@ def test_attr_pipeline_checks(en_vocab):
|
|||
([{"IS_LEFT_PUNCT": True}], "``"),
|
||||
([{"IS_RIGHT_PUNCT": True}], "''"),
|
||||
([{"IS_STOP": True}], "the"),
|
||||
([{"SPACY": True}], "the"),
|
||||
([{"LIKE_NUM": True}], "1"),
|
||||
([{"LIKE_URL": True}], "http://example.com"),
|
||||
([{"LIKE_EMAIL": True}], "mail@example.com"),
|
||||
|
|
|
@ -372,9 +372,10 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
|
|||
# Set attributes on both token and lexeme to take care of token
|
||||
# attribute vs. lexical attribute without having to enumerate
|
||||
# them. If an attribute name is not valid, set_struct_attr will
|
||||
# ignore it.
|
||||
# ignore it. Exception: set NORM only on tokens.
|
||||
Token.set_struct_attr(token, attr_name, get_string_id(attr_value))
|
||||
Lexeme.set_struct_attr(<LexemeC*>token.lex, attr_name, get_string_id(attr_value))
|
||||
if attr_name != NORM:
|
||||
Lexeme.set_struct_attr(<LexemeC*>token.lex, attr_name, get_string_id(attr_value))
|
||||
# Assign correct dependencies to the inner token
|
||||
for i, head in enumerate(heads):
|
||||
doc.c[token_index + i].head = head
|
||||
|
|
|
@ -1295,6 +1295,13 @@ def combine_score_weights(
|
|||
|
||||
|
||||
class DummyTokenizer:
|
||||
def __call__(self, text):
|
||||
raise NotImplementedError
|
||||
|
||||
def pipe(self, texts, **kwargs):
|
||||
for text in texts:
|
||||
yield self(text)
|
||||
|
||||
# add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
|
||||
# allow serialization (see #1557)
|
||||
def to_bytes(self, **kwargs):
|
||||
|
|
|
@ -169,6 +169,7 @@ rule-based matching are:
|
|||
| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ |
|
||||
| `IS_SENT_START` | Token is start of sentence. ~~bool~~ |
|
||||
| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ |
|
||||
| `SPACY` | Token has a trailing space. ~~bool~~ |
|
||||
| `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. Note that the values of these attributes are case-sensitive. For a list of available part-of-speech tags and dependency labels, see the [Annotation Specifications](/api/annotation). ~~str~~ |
|
||||
| `ENT_TYPE` | The token's entity label. ~~str~~ |
|
||||
| `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
|
||||
|
|
Loading…
Reference in New Issue
Block a user