Mirror of https://github.com/explosion/spaCy.git, synced 2025-01-12 02:06:31 +03:00

Commit d98d525bc8: Merge remote-tracking branch 'upstream/master' into chore/update-develop-from-master-v3.1-3

.github/ISSUE_TEMPLATE/config.yml (vendored, 5 changes)

@@ -1,8 +1,11 @@
 blank_issues_enabled: false
 contact_links:
+  - name: ⚠️ Python 3.10 Support
+    url: https://github.com/explosion/spaCy/discussions/9418
+    about: Python 3.10 wheels haven't been released yet, see the link for details.
   - name: 🗯 Discussions Forum
     url: https://github.com/explosion/spaCy/discussions
-    about: Usage questions, general discussion and anything else that isn't a bug report.
+    about: Install issues, usage questions, general discussion and anything else that isn't a bug report.
   - name: 📖 spaCy FAQ & Troubleshooting
     url: https://github.com/explosion/spaCy/discussions/8226
     about: Before you post, check out the FAQ for answers to common community questions!

.github/azure-steps.yml (vendored, 5 changes)

@@ -100,3 +100,8 @@ steps:
       python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
     displayName: 'Test assemble CLI vectors warning'
     condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python .github/validate_universe_json.py website/meta/universe.json
+    displayName: 'Test website/meta/universe.json'
+    condition: eq(variables['python_version'], '3.8')

.github/validate_universe_json.py (vendored, new file, 19 lines)

@@ -0,0 +1,19 @@
+import json
+import re
+import sys
+from pathlib import Path
+
+
+def validate_json(document):
+    universe_file = Path(document)
+    with universe_file.open() as f:
+        universe_data = json.load(f)
+    for entry in universe_data["resources"]:
+        if "github" in entry:
+            assert not re.match(
+                r"^(http:)|^(https:)", entry["github"]
+            ), "Github field should be user/repo, not a url"
+
+
+if __name__ == "__main__":
+    validate_json(str(sys.argv[1]))

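To run the same check locally before sending a universe PR, the CI step added above invokes the script as:

    python .github/validate_universe_json.py website/meta/universe.json
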
.github/workflows/explosionbot.yml (vendored, new file, 27 lines)

@@ -0,0 +1,27 @@
+name: Explosion Bot
+
+on:
+  issue_comment:
+    types:
+      - created
+      - edited
+
+jobs:
+  explosion-bot:
+    runs-on: ubuntu-18.04
+    steps:
+      - name: Dump GitHub context
+        env:
+          GITHUB_CONTEXT: ${{ toJson(github) }}
+        run: echo "$GITHUB_CONTEXT"
+      - uses: actions/checkout@v1
+      - uses: actions/setup-python@v1
+      - name: Install and run explosion-bot
+        run: |
+          pip install git+https://${{ secrets.EXPLOSIONBOT_TOKEN }}@github.com/explosion/explosion-bot
+          python -m explosionbot
+        env:
+          INPUT_TOKEN: ${{ secrets.EXPLOSIONBOT_TOKEN }}
+          INPUT_BK_TOKEN: ${{ secrets.BUILDKITE_SECRET }}
+          ENABLED_COMMANDS: "test_gpu"
+          ALLOWED_TEAMS: "spaCy"

CITATION (deleted file, 8 lines)

@@ -1,8 +0,0 @@
-@software{spacy,
-  author = {Honnibal, Matthew and Montani, Ines and Van Landeghem, Sofie and Boyd, Adriane},
-  title = {{spaCy: Industrial-strength Natural Language Processing in Python}},
-  year = 2020,
-  publisher = {Zenodo},
-  doi = {10.5281/zenodo.1212303},
-  url = {https://doi.org/10.5281/zenodo.1212303}
-}

CITATION.cff (new file, 16 lines)

@@ -0,0 +1,16 @@
+cff-version: 1.2.0
+preferred-citation:
+  type: article
+  message: "If you use spaCy, please cite it as below."
+  authors:
+    - family-names: "Honnibal"
+      given-names: "Matthew"
+    - family-names: "Montani"
+      given-names: "Ines"
+    - family-names: "Van Landeghem"
+      given-names: "Sofie"
+    - family-names: "Boyd"
+      given-names: "Adriane"
+  title: "spaCy: Industrial-strength Natural Language Processing in Python"
+  doi: "10.5281/zenodo.1212303"
+  year: 2020

@@ -1,5 +1,5 @@
 recursive-include include *.h
-recursive-include spacy *.pyx *.pxd *.txt *.cfg *.jinja
+recursive-include spacy *.pyx *.pxd *.txt *.cfg *.jinja *.toml
 include LICENSE
 include README.md
 include pyproject.toml

@@ -16,6 +16,8 @@ pr:
     exclude:
       - "website/*"
       - "*.md"
+    include:
+      - "website/meta/universe.json"
 
 jobs:
   # Perform basic checks for most important errors (syntax etc.) Uses the config
@@ -124,7 +124,8 @@ exclude =
 
 [tool:pytest]
 markers =
-    slow
+    slow: mark a test as slow
+    issue: reference specific issue
 
 [mypy]
 ignore_missing_imports = True

@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Dict, Any
 from pathlib import Path
 from wasabi import msg
 import typer
@@ -7,7 +7,7 @@ import sys
 
 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 from ._util import import_code, setup_gpu
-from ..training.loop import train
+from ..training.loop import train as train_nlp
 from ..training.initialize import init_nlp
 from .. import util
 
@@ -40,6 +40,18 @@ def train_cli(
     DOCS: https://spacy.io/api/cli#train
     """
     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+    overrides = parse_config_overrides(ctx.args)
+    import_code(code_path)
+    train(config_path, output_path, use_gpu=use_gpu, overrides=overrides)
+
+
+def train(
+    config_path: Path,
+    output_path: Optional[Path] = None,
+    *,
+    use_gpu: int = -1,
+    overrides: Dict[str, Any] = util.SimpleFrozenDict(),
+):
     # Make sure all files and paths exists if they are needed
     if not config_path or (str(config_path) != "-" and not config_path.exists()):
         msg.fail("Config file not found", config_path, exits=1)
@@ -50,8 +62,6 @@ def train_cli(
         output_path.mkdir(parents=True)
         msg.good(f"Created output directory: {output_path}")
     msg.info(f"Saving to output directory: {output_path}")
-    overrides = parse_config_overrides(ctx.args)
-    import_code(code_path)
     setup_gpu(use_gpu)
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides, interpolate=False)
@@ -60,4 +70,4 @@ def train_cli(
         nlp = init_nlp(config, use_gpu=use_gpu)
     msg.good("Initialized pipeline")
     msg.divider("Training pipeline")
-    train(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr)
+    train_nlp(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr)

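A side effect of splitting `train_cli` is that training can now be driven from Python with the same options as the CLI; a minimal sketch based on the new signature (the config file and output path are placeholders):

    from pathlib import Path
    from spacy.cli.train import train

    # roughly: python -m spacy train config.cfg --output ./output --training.max_epochs 2
    train(
        Path("config.cfg"),
        output_path=Path("./output"),
        use_gpu=-1,  # CPU; pass a GPU id to train on GPU
        overrides={"training.max_epochs": 2},
    )
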
@@ -25,7 +25,7 @@ def setup_default_warnings():
         filter_warning("once", error_msg=Warnings.W036.format(name=pipe))
 
     # warn once about lemmatizer without required POS
-    filter_warning("once", error_msg="[W108]")
+    filter_warning("once", error_msg=Warnings.W108)
 
 
 def filter_warning(action: str, error_msg: str):
@@ -170,8 +170,8 @@ class Warnings:
             "call the {matcher} on each Doc object.")
     W107 = ("The property `Doc.{prop}` is deprecated. Use "
             "`Doc.has_annotation(\"{attr}\")` instead.")
-    W108 = ("The rule-based lemmatizer did not find POS annotation for the "
-            "token '{text}'. Check that your pipeline includes components that "
+    W108 = ("The rule-based lemmatizer did not find POS annotation for one or "
+            "more tokens. Check that your pipeline includes components that "
             "assign token.pos, typically 'tagger'+'attribute_ruler' or "
             "'morphologizer'.")
     W109 = ("Unable to save user hooks while serializing the doc. Re-add any "
@@ -658,7 +658,9 @@ class Errors:
             "{nO} - cannot add any more labels.")
     E923 = ("It looks like there is no proper sample data to initialize the "
             "Model of component '{name}'. To check your input data paths and "
-            "annotation, run: python -m spacy debug data config.cfg")
+            "annotation, run: python -m spacy debug data config.cfg "
+            "and include the same config override values you would specify "
+            "for the 'spacy train' command.")
     E924 = ("The '{name}' component does not seem to be initialized properly. "
             "This is likely a bug in spaCy, so feel free to open an issue: "
             "https://github.com/explosion/spaCy/issues")
@@ -793,7 +795,7 @@ class Errors:
             "to token boundaries.")
     E982 = ("The `Token.ent_iob` attribute should be an integer indexing "
             "into {values}, but found {value}.")
-    E983 = ("Invalid key for '{dict}': {key}. Available keys: "
+    E983 = ("Invalid key(s) for '{dict}': {key}. Available keys: "
            "{keys}")
     E984 = ("Invalid component config for '{name}': component block needs either "
            "a key `factory` specifying the registered function used to "

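The expanded E923 hint matters because `debug data` resolves the config the same way `spacy train` does, so any overrides used for training need to be repeated; for example (paths are placeholders):

    python -m spacy debug data config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
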
@@ -76,6 +76,6 @@ class CatalanLemmatizer(Lemmatizer):
             forms.append(self.lookup_lemmatize(token)[0])
         if not forms:
             forms.append(string)
-        forms = list(set(forms))
+        forms = list(dict.fromkeys(forms))
         self.cache[cache_key] = forms
         return forms

@@ -75,6 +75,6 @@ class FrenchLemmatizer(Lemmatizer):
             forms.append(self.lookup_lemmatize(token)[0])
         if not forms:
             forms.append(string)
-        forms = list(set(forms))
+        forms = list(dict.fromkeys(forms))
         self.cache[cache_key] = forms
         return forms

@@ -97,7 +97,7 @@ class DutchLemmatizer(Lemmatizer):
                 return forms
             else:
                 oov_forms.append(form)
-        forms = list(set(oov_forms))
+        forms = list(dict.fromkeys(oov_forms))
         # Back-off through remaining return value candidates.
         if forms:
             for form in forms:

@@ -58,7 +58,7 @@ class RussianLemmatizer(Lemmatizer):
         if not len(filtered_analyses):
             return [string.lower()]
         if morphology is None or (len(morphology) == 1 and POS in morphology):
-            return list(set([analysis.normal_form for analysis in filtered_analyses]))
+            return list(dict.fromkeys([analysis.normal_form for analysis in filtered_analyses]))
         if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"):
             features_to_compare = ["Case", "Number", "Gender"]
         elif univ_pos == "NUM":
@@ -89,7 +89,7 @@ class RussianLemmatizer(Lemmatizer):
                     filtered_analyses.append(analysis)
         if not len(filtered_analyses):
             return [string.lower()]
-        return list(set([analysis.normal_form for analysis in filtered_analyses]))
+        return list(dict.fromkeys([analysis.normal_form for analysis in filtered_analyses]))
 
     def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]:
         string = token.text

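The four lemmatizer hunks above make the same substitution: `dict.fromkeys` removes duplicates like `set`, but keeps the order in which candidate forms were produced, so the first-ranked lemma stays first and results are deterministic. A small illustration:

    forms = ["be", "is", "be", "was"]
    list(set(forms))            # order depends on hashing, not on the input
    list(dict.fromkeys(forms))  # ['be', 'is', 'was'], input order preserved
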
@@ -707,8 +707,9 @@ class Language:
         source_config = source.config.interpolate()
         pipe_config = util.copy_config(source_config["components"][source_name])
         self._pipe_configs[name] = pipe_config
-        for s in source.vocab.strings:
-            self.vocab.strings.add(s)
+        if self.vocab.strings != source.vocab.strings:
+            for s in source.vocab.strings:
+                self.vocab.strings.add(s)
         return pipe, pipe_config["factory"]
 
     def add_pipe(
@@ -1379,6 +1380,9 @@ class Language:
         scorer = Scorer(**kwargs)
         # reset annotation in predicted docs and time tokenization
         start_time = timer()
+        # this is purely for timing
+        for eg in examples:
+            self.make_doc(eg.reference.text)
         # apply all pipeline components
         for name, pipe in self.pipeline:
             kwargs = component_cfg.get(name, {})
@@ -1708,6 +1712,7 @@ class Language:
         # them here so they're only loaded once
         source_nlps = {}
         source_nlp_vectors_hashes = {}
+        vocab_b = None
         for pipe_name in config["nlp"]["pipeline"]:
             if pipe_name not in pipeline:
                 opts = ", ".join(pipeline.keys())
@@ -1730,14 +1735,22 @@ class Language:
                         raw_config=raw_config,
                     )
                 else:
+                    # We need the sourced components to reference the same
+                    # vocab without modifying the current vocab state **AND**
+                    # we still want to load the source model vectors to perform
+                    # the vectors check. Since the source vectors clobber the
+                    # current ones, we save the original vocab state and
+                    # restore after this loop. Existing strings are preserved
+                    # during deserialization, so they do not need any
+                    # additional handling.
+                    if vocab_b is None:
+                        vocab_b = nlp.vocab.to_bytes(exclude=["lookups", "strings"])
                     model = pipe_cfg["source"]
                     if model not in source_nlps:
-                        # We only need the components here and we intentionally
-                        # do not load the model with the same vocab because
-                        # this would cause the vectors to be copied into the
-                        # current nlp object (all the strings will be added in
-                        # create_pipe_from_source)
-                        source_nlps[model] = util.load_model(model)
+                        # Load with the same vocab, adding any strings
+                        source_nlps[model] = util.load_model(
+                            model, vocab=nlp.vocab, exclude=["lookups"]
+                        )
                     source_name = pipe_cfg.get("component", pipe_name)
                     listeners_replaced = False
                     if "replace_listeners" in pipe_cfg:
@@ -1764,6 +1777,9 @@ class Language:
                     # Delete from cache if listeners were replaced
                     if listeners_replaced:
                         del source_nlps[model]
+        # Restore the original vocab after sourcing if necessary
+        if vocab_b is not None:
+            nlp.vocab.from_bytes(vocab_b)
         disabled_pipes = [*config["nlp"]["disabled"], *disable]
         nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
         nlp.batch_size = config["nlp"]["batch_size"]

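This code path is exercised when a pipeline config sources components from an installed package, e.g. (the package name is a placeholder):

    [components.ner]
    source = "en_core_web_sm"

With the change, the sourced pipeline is loaded into the current `nlp.vocab`, and the original vocab state (minus strings, which are only ever added to) is saved once and restored after the loop, so the source model's vectors no longer overwrite the vectors of the pipeline being built.
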
@@ -177,13 +177,14 @@ cdef class DependencyMatcher:
 
         # Add 'RIGHT_ATTRS' to self._patterns[key]
         _patterns = [[[pat["RIGHT_ATTRS"]] for pat in pattern] for pattern in patterns]
+        pattern_offset = len(self._patterns[key])
         self._patterns[key].extend(_patterns)
 
         # Add each node pattern of all the input patterns individually to the
         # matcher. This enables only a single instance of Matcher to be used.
         # Multiple adds are required to track each node pattern.
         tokens_to_key_list = []
-        for i, current_patterns in enumerate(_patterns):
+        for i, current_patterns in enumerate(_patterns, start=pattern_offset):
 
             # Preallocate list space
             tokens_to_key = [None] * len(current_patterns)
@@ -263,7 +264,9 @@ cdef class DependencyMatcher:
         self._raw_patterns.pop(key)
         self._tree.pop(key)
         self._root.pop(key)
-        self._tokens_to_key.pop(key)
+        for mklist in self._tokens_to_key.pop(key):
+            for mkey in mklist:
+                self._matcher.remove(mkey)
 
     def _get_keys_to_position_maps(self, doc):
         """

@@ -208,7 +208,7 @@ class Lemmatizer(Pipe):
         univ_pos = token.pos_.lower()
         if univ_pos in ("", "eol", "space"):
             if univ_pos == "":
-                warnings.warn(Warnings.W108.format(text=string))
+                warnings.warn(Warnings.W108)
             return [string.lower()]
         # See Issue #435 for example of where this logic is requied.
         if self.is_base_form(token):

@@ -4,6 +4,7 @@ from spacy.util import get_lang_class
 
 def pytest_addoption(parser):
     parser.addoption("--slow", action="store_true", help="include slow tests")
+    parser.addoption("--issue", action="store", help="test specific issues")
 
 
 def pytest_runtest_setup(item):
@@ -16,10 +17,24 @@ def pytest_runtest_setup(item):
         # options weren't given.
         return item.config.getoption(f"--{opt}", False)
 
+    # Integration of boolean flags
     for opt in ["slow"]:
         if opt in item.keywords and not getopt(opt):
             pytest.skip(f"need --{opt} option to run")
+
+    # Special integration to mark tests with issue numbers
+    issues = getopt("issue")
+    if isinstance(issues, str):
+        if "issue" in item.keywords:
+            # Convert issues provided on the CLI to list of ints
+            issue_nos = [int(issue.strip()) for issue in issues.split(",")]
+            # Get all issues specified by decorators and check if they're provided
+            issue_refs = [mark.args[0] for mark in item.iter_markers(name="issue")]
+            if not any([ref in issue_nos for ref in issue_refs]):
+                pytest.skip(f"not referencing specified issues: {issue_nos}")
+        else:
+            pytest.skip("not referencing any issues")
 
 
 # Fixtures for language tokenizers (languages sorted alphabetically)

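Together with the `issue` marker registered in setup.cfg above, regression tests can now be tagged and selected by issue number; a usage sketch:

    # in a test module
    @pytest.mark.issue(8168)
    def test_issue8168():
        ...

    # on the command line, run only tests referencing the given issues (comma-separated):
    # python -m pytest spacy/tests --issue 8168,9263
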
@@ -368,3 +368,87 @@ def test_dependency_matcher_span_user_data(en_tokenizer):
     assert doc_match[0] == span_match[0]
     for doc_t_i, span_t_i in zip(doc_match[1], span_match[1]):
         assert doc_t_i == span_t_i + offset
+
+
+def test_dependency_matcher_order_issue(en_tokenizer):
+    # issue from #9263
+    doc = en_tokenizer("I like text")
+    doc[2].head = doc[1]
+
+    # this matches on attrs but not rel op
+    pattern1 = [
+        {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "like"}},
+        {
+            "LEFT_ID": "root",
+            "RIGHT_ID": "r",
+            "RIGHT_ATTRS": {"ORTH": "text"},
+            "REL_OP": "<",
+        },
+    ]
+
+    # this matches on rel op but not attrs
+    pattern2 = [
+        {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "like"}},
+        {
+            "LEFT_ID": "root",
+            "RIGHT_ID": "r",
+            "RIGHT_ATTRS": {"ORTH": "fish"},
+            "REL_OP": ">",
+        },
+    ]
+
+    matcher = DependencyMatcher(en_tokenizer.vocab)
+
+    # This should behave the same as the next pattern
+    matcher.add("check", [pattern1, pattern2])
+    matches = matcher(doc)
+
+    assert matches == []
+
+    # use a new matcher
+    matcher = DependencyMatcher(en_tokenizer.vocab)
+    # adding one at a time under same label gets a match
+    matcher.add("check", [pattern1])
+    matcher.add("check", [pattern2])
+    matches = matcher(doc)
+
+    assert matches == []
+
+
+def test_dependency_matcher_remove(en_tokenizer):
+    # issue from #9263
+    doc = en_tokenizer("The red book")
+    doc[1].head = doc[2]
+
+    # this matches
+    pattern1 = [
+        {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "book"}},
+        {
+            "LEFT_ID": "root",
+            "RIGHT_ID": "r",
+            "RIGHT_ATTRS": {"ORTH": "red"},
+            "REL_OP": ">",
+        },
+    ]
+
+    # add and then remove it
+    matcher = DependencyMatcher(en_tokenizer.vocab)
+    matcher.add("check", [pattern1])
+    matcher.remove("check")
+
+    # this matches on rel op but not attrs
+    pattern2 = [
+        {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "flag"}},
+        {
+            "LEFT_ID": "root",
+            "RIGHT_ID": "r",
+            "RIGHT_ATTRS": {"ORTH": "blue"},
+            "REL_OP": ">",
+        },
+    ]
+
+    # Adding this new pattern with the same label, which should not match
+    matcher.add("check", [pattern2])
+    matches = matcher(doc)
+
+    assert matches == []

spacy/tests/package/__init__.py (new, empty file)

@@ -114,7 +114,7 @@ def test_make_spangroup(max_positive, nr_results):
     doc = nlp.make_doc("Greater London")
     ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2])
     indices = ngram_suggester([doc])[0].dataXd
-    assert_array_equal(indices, numpy.asarray([[0, 1], [1, 2], [0, 2]]))
+    assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]]))
     labels = ["Thing", "City", "Person", "GreatCity"]
     scores = numpy.asarray(
         [[0.2, 0.4, 0.3, 0.1], [0.1, 0.6, 0.2, 0.4], [0.8, 0.7, 0.3, 0.9]], dtype="f"

@@ -49,8 +49,8 @@ def test_issue5551(textcat_config):
     # All results should be the same because of the fixed seed
     assert len(results) == 3
     ops = get_current_ops()
-    assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1]))
-    assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]))
+    assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1]), decimal=5)
+    assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]), decimal=5)
 
 
 def test_issue5838():

spacy/tests/regression/test_issue7716.py (new file, 54 lines)

@@ -0,0 +1,54 @@
+import pytest
+from thinc.api import Adam
+from spacy.attrs import NORM
+from spacy.vocab import Vocab
+from spacy import registry
+from spacy.training import Example
+from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
+from spacy.tokens import Doc
+from spacy.pipeline import DependencyParser
+
+
+@pytest.fixture
+def vocab():
+    return Vocab(lex_attr_getters={NORM: lambda s: s})
+
+
+def _parser_example(parser):
+    doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
+    gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]}
+    return Example.from_dict(doc, gold)
+
+
+@pytest.fixture
+def parser(vocab):
+    vocab.strings.add("ROOT")
+    cfg = {"model": DEFAULT_PARSER_MODEL}
+    model = registry.resolve(cfg, validate=True)["model"]
+    parser = DependencyParser(vocab, model)
+    parser.cfg["token_vector_width"] = 4
+    parser.cfg["hidden_width"] = 32
+    # parser.add_label('right')
+    parser.add_label("left")
+    parser.initialize(lambda: [_parser_example(parser)])
+    sgd = Adam(0.001)
+
+    for i in range(10):
+        losses = {}
+        doc = Doc(vocab, words=["a", "b", "c", "d"])
+        example = Example.from_dict(
+            doc, {"heads": [1, 1, 3, 3], "deps": ["left", "ROOT", "left", "ROOT"]}
+        )
+        parser.update([example], sgd=sgd, losses=losses)
+    return parser
+
+
+@pytest.mark.xfail(reason="Not fixed yet")
+def test_partial_annotation(parser):
+    doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
+    doc[2].is_sent_start = False
+    # Note that if the following line is used, then doc[2].is_sent_start == False
+    # doc[3].is_sent_start = False
+
+    doc = parser(doc)
+    assert doc[2].is_sent_start == False

@@ -1,6 +1,8 @@
+import pytest
 from spacy.lang.en import English
 
 
+@pytest.mark.issue(8168)
 def test_issue8168():
     nlp = English()
     ruler = nlp.add_pipe("entity_ruler")

@@ -193,6 +193,7 @@ def test_models_update_consistently(seed, dropout, model_func, kwargs, get_X):
     assert_array_almost_equal(
         model1.ops.to_numpy(get_all_params(model1)),
         model2.ops.to_numpy(get_all_params(model2)),
+        decimal=5,
     )
 
 

@@ -82,15 +82,15 @@ def test_cat_readers(reader, additional_config):
 
     [nlp]
     lang = "en"
-    pipeline = ["tok2vec", "textcat"]
+    pipeline = ["tok2vec", "textcat_multilabel"]
 
     [components]
 
     [components.tok2vec]
     factory = "tok2vec"
 
-    [components.textcat]
-    factory = "textcat"
+    [components.textcat_multilabel]
+    factory = "textcat_multilabel"
     """
     config = Config().from_str(nlp_config_string)
     config["corpora"]["@readers"] = reader

@@ -8,7 +8,7 @@ from thinc.api import NumpyOps
 from .doc import Doc
 from ..vocab import Vocab
 from ..compat import copy_reg
-from ..attrs import SPACY, ORTH, intify_attr
+from ..attrs import SPACY, ORTH, intify_attr, IDS
 from ..errors import Errors
 from ..util import ensure_path, SimpleFrozenList
 
@@ -64,7 +64,13 @@ class DocBin:
 
         DOCS: https://spacy.io/api/docbin#init
         """
-        attrs = sorted([intify_attr(attr) for attr in attrs])
+        int_attrs = [intify_attr(attr) for attr in attrs]
+        if None in int_attrs:
+            non_valid = [attr for attr in attrs if intify_attr(attr) is None]
+            raise KeyError(
+                Errors.E983.format(dict="attrs", key=non_valid, keys=IDS.keys())
+            ) from None
+        attrs = sorted(int_attrs)
         self.version = "0.1"
         self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY]
         self.attrs.insert(0, ORTH)  # Ensure ORTH is always attrs[0]

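The effect of the added validation, as a sketch: an invalid attribute name now fails fast at construction time with E983 instead of surfacing later during serialization.

    from spacy.tokens import DocBin

    DocBin(attrs=["LEMMA", "POS"])   # accepted as before
    DocBin(attrs=["LEMMA", "POSS"])  # raises KeyError (E983) listing the valid attribute names
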
@@ -1,4 +1,4 @@
-from .corpus import Corpus  # noqa: F401
+from .corpus import Corpus, JsonlCorpus  # noqa: F401
 from .example import Example, validate_examples, validate_get_examples  # noqa: F401
 from .alignment import Alignment  # noqa: F401
 from .augment import dont_augment, orth_variants_augmenter  # noqa: F401

@@ -144,7 +144,12 @@ def load_vectors_into_model(
 ) -> None:
     """Load word vectors from an installed model or path into a model instance."""
     try:
-        vectors_nlp = load_model(name)
+        # Load with the same vocab, which automatically adds the vectors to
+        # the current nlp object. Exclude lookups so they are not modified.
+        exclude = ["lookups"]
+        if not add_strings:
+            exclude.append("strings")
+        vectors_nlp = load_model(name, vocab=nlp.vocab, exclude=exclude)
     except ConfigValidationError as e:
         title = f"Config validation error for vectors {name}"
         desc = (
@@ -158,15 +163,8 @@ def load_vectors_into_model(
     if len(vectors_nlp.vocab.vectors.keys()) == 0:
         logger.warning(Warnings.W112.format(name=name))
 
-    nlp.vocab.vectors = vectors_nlp.vocab.vectors
     for lex in nlp.vocab:
         lex.rank = nlp.vocab.vectors.key2row.get(lex.orth, OOV_RANK)
-    if add_strings:
-        # I guess we should add the strings from the vectors_nlp model?
-        # E.g. if someone does a similarity query, they might expect the strings.
-        for key in nlp.vocab.vectors.key2row:
-            if key in vectors_nlp.vocab.strings:
-                nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
 
 
 def init_tok2vec(

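`load_vectors_into_model` backs the vectors setting in the initialization config; after this change the vectors source is loaded into the same vocab (optionally without its strings) instead of having its vectors and strings copied over afterwards. For reference, the config that triggers it (the package name is a placeholder):

    [initialize]
    vectors = "en_core_web_lg"
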
@@ -1475,7 +1475,7 @@ def get_arg_names(func: Callable) -> List[str]:
     RETURNS (List[str]): The argument names.
     """
     argspec = inspect.getfullargspec(func)
-    return list(set([*argspec.args, *argspec.kwonlyargs]))
+    return list(dict.fromkeys([*argspec.args, *argspec.kwonlyargs]))
 
 
 def combine_score_weights(

@@ -530,7 +530,6 @@ cdef class Vocab:
 
         setters = {
             "strings": lambda b: self.strings.from_bytes(b),
-            "lexemes": lambda b: self.lexemes_from_bytes(b),
            "vectors": lambda b: serialize_vectors(b),
            "lookups": lambda b: self.lookups.from_bytes(b),
         }

@@ -260,16 +260,18 @@ $ python -m spacy convert [input_file] [output_dir] [--converter] [--file-type]
 
 | Name | Description |
 | ------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------- |
-| `input_file` | Input file. ~~Path (positional)~~ |
+| `input_path` | Input file or directory. ~~Path (positional)~~ |
 | `output_dir` | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. ~~Optional[Path] \(option)~~ |
 | `--converter`, `-c` <Tag variant="new">2</Tag> | Name of converter to use (see below). ~~str (option)~~ |
 | `--file-type`, `-t` <Tag variant="new">2.1</Tag> | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. ~~str (option)~~ |
 | `--n-sents`, `-n` | Number of sentences per document. Supported for: `conll`, `conllu`, `iob`, `ner` ~~int (option)~~ |
 | `--seg-sents`, `-s` <Tag variant="new">2.2</Tag> | Segment sentences. Supported for: `conll`, `ner` ~~bool (flag)~~ |
-| `--base`, `-b` | Trained spaCy pipeline for sentence segmentation to use as base (for `--seg-sents`). ~~Optional[str](option)~~ |
+| `--base`, `-b`, `--model` | Trained spaCy pipeline for sentence segmentation to use as base (for `--seg-sents`). ~~Optional[str](option)~~ |
 | `--morphology`, `-m` | Enable appending morphology to tags. Supported for: `conllu` ~~bool (flag)~~ |
+| `--merge-subtokens`, `-T` | Merge CoNLL-U subtokens ~~bool (flag)~~ |
 | `--ner-map`, `-nm` | NER tag mapping (as JSON-encoded dict of entity types). Supported for: `conllu` ~~Optional[Path](option)~~ |
 | `--lang`, `-l` <Tag variant="new">2.1</Tag> | Language code (if tokenizer required). ~~Optional[str] \(option)~~ |
+| `--concatenate`, `-C` | Concatenate output to a single file ~~bool (flag)~~ |
 | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
 | **CREATES** | Binary [`DocBin`](/api/docbin) training data that can be used with [`spacy train`](/api/cli#train). |
 

@@ -474,8 +474,8 @@ The L2 norm of the token's vector representation.
 | `like_email` | Does the token resemble an email address? ~~bool~~ |
 | `is_oov` | Is the token out-of-vocabulary (i.e. does it not have a word vector)? ~~bool~~ |
 | `is_stop` | Is the token part of a "stop list"? ~~bool~~ |
-| `pos` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~int~~ |
-| `pos_` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~str~~ |
+| `pos` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/u/pos/). ~~int~~ |
+| `pos_` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/u/pos/). ~~str~~ |
 | `tag` | Fine-grained part-of-speech. ~~int~~ |
 | `tag_` | Fine-grained part-of-speech. ~~str~~ |
 | `morph` <Tag variant="new">3</Tag> | Morphological analysis. ~~MorphAnalysis~~ |

@@ -325,6 +325,5 @@ serialization by passing in the string names via the `exclude` argument.
 | Name | Description |
 | --------- | ----------------------------------------------------- |
 | `strings` | The strings in the [`StringStore`](/api/stringstore). |
-| `lexemes` | The lexeme data. |
 | `vectors` | The word vectors, if available. |
 | `lookups` | The lookup tables, if available. |

@@ -25,7 +25,7 @@ for token in doc:
 
 > - **Text:** The original word text.
 > - **Lemma:** The base form of the word.
-> - **POS:** The simple [UPOS](https://universaldependencies.org/docs/u/pos/)
+> - **POS:** The simple [UPOS](https://universaldependencies.org/u/pos/)
 >   part-of-speech tag.
 > - **Tag:** The detailed part-of-speech tag.
 > - **Dep:** Syntactic dependency, i.e. the relation between tokens.

@@ -284,7 +284,9 @@ $ python -m pytest --pyargs %%SPACY_PKG_NAME --slow # basic and slow test
 ## Troubleshooting guide {#troubleshooting}
 
 This section collects some of the most common errors you may come across when
-installing, loading and using spaCy, as well as their solutions.
+installing, loading and using spaCy, as well as their solutions. Also see the
+[Discussions FAQ Thread](https://github.com/explosion/spaCy/discussions/8226),
+which is updated more frequently and covers more transitory issues.
 
 > #### Help us improve this guide
 >
@@ -311,62 +313,6 @@ language's `Language` class instead, for example
 </Accordion>
 
-<Accordion title="No such option: --no-cache-dir" id="no-cache-dir">
-
-```
-no such option: --no-cache-dir
-```
-
-The `download` command uses pip to install the pipeline packages and sets the
-`--no-cache-dir` flag to prevent it from requiring too much memory.
-[This setting](https://pip.pypa.io/en/stable/reference/pip_install/#caching)
-requires pip v6.0 or newer. Run `pip install -U pip` to upgrade to the latest
-version of pip. To see which version you have installed, run `pip --version`.
-
-</Accordion>
-
-<Accordion title="sre_constants.error: bad character range" id="narrow-unicode">
-
-```
-sre_constants.error: bad character range
-```
-
-In [v2.1](/usage/v2-1), spaCy changed its implementation of regular expressions
-for tokenization to make it up to 2-3 times faster. But this also means that
-it's very important now that you run spaCy with a wide unicode build of Python.
-This means that the build has 1114111 unicode characters available, instead of
-only 65535 in a narrow unicode build. You can check this by running the
-following command:
-
-```bash
-$ python -c "import sys; print(sys.maxunicode)"
-```
-
-If you're running a narrow unicode build, reinstall Python and use a wide
-unicode build instead. You can also rebuild Python and set the
-`--enable-unicode=ucs4` flag.
-
-</Accordion>
-
-<Accordion title="Unknown locale: UTF-8" id="unknown-locale">
-
-```
-ValueError: unknown locale: UTF-8
-```
-
-This error can sometimes occur on OSX and is likely related to a still
-unresolved [Python bug](https://bugs.python.org/issue18378). However, it's easy
-to fix: just add the following to your `~/.bash_profile` or `~/.zshrc` and then
-run `source ~/.bash_profile` or `source ~/.zshrc`. Make sure to add **both
-lines** for `LC_ALL` and `LANG`.
-
-```bash
-$ export LC_ALL=en_US.UTF-8
-$ export LANG=en_US.UTF-8
-```
-
-</Accordion>
-
 <Accordion title="Import error: No module named spacy" id="import-error">
 
 ```

@@ -1363,20 +1363,19 @@
             "url": "https://explosion.ai/demos/sense2vec",
             "code_example": [
                 "import spacy",
-                "from sense2vec import Sense2VecComponent",
                 "",
-                "nlp = spacy.load('en')",
-                "s2v = Sense2VecComponent('/path/to/reddit_vectors-1.1.0')",
-                "nlp.add_pipe(s2v)",
+                "nlp = spacy.load(\"en_core_web_sm\")",
+                "s2v = nlp.add_pipe(\"sense2vec\")",
+                "s2v.from_disk(\"/path/to/s2v_reddit_2015_md\")",
                 "",
                 "doc = nlp(\"A sentence about natural language processing.\")",
-                "assert doc[3].text == 'natural language processing'",
-                "freq = doc[3]._.s2v_freq",
-                "vector = doc[3]._.s2v_vec",
-                "most_similar = doc[3]._.s2v_most_similar(3)",
-                "# [(('natural language processing', 'NOUN'), 1.0),",
-                "# (('machine learning', 'NOUN'), 0.8986966609954834),",
-                "# (('computer vision', 'NOUN'), 0.8636297583580017)]"
+                "assert doc[3:6].text == \"natural language processing\"",
+                "freq = doc[3:6]._.s2v_freq",
+                "vector = doc[3:6]._.s2v_vec",
+                "most_similar = doc[3:6]._.s2v_most_similar(3)",
+                "# [(('machine learning', 'NOUN'), 0.8986967),",
+                "# (('computer vision', 'NOUN'), 0.8636297),",
+                "# (('deep learning', 'NOUN'), 0.8573361)]"
             ],
             "category": ["pipeline", "standalone", "visualizers"],
             "tags": ["vectors"],
@@ -2970,11 +2969,10 @@
             "github": "thomasthiebaud/spacy-fastlang",
             "pip": "spacy_fastlang",
             "code_example": [
-                "import spacy",
-                "from spacy_fastlang import LanguageDetector",
+                "import spacy_fastlang",
                 "",
-                "nlp = spacy.load('en_core_web_sm')",
-                "nlp.add_pipe(LanguageDetector())",
+                "nlp = spacy.load(\"en_core_web_sm\")",
+                "nlp.add_pipe(\"language_detector\")",
                 "doc = nlp('Life is like a box of chocolates. You never know what you are gonna get.')",
                 "",
                 "assert doc._.language == 'en'",
@@ -3476,7 +3474,51 @@
                 "github": "bbieniek"
             },
             "category": ["apis"]
-        }
+        },
+        {
+            "id": "phruzz_matcher",
+            "title": "phruzz-matcher",
+            "slogan": "Phrase matcher using RapidFuzz",
+            "description": "Combination of the RapidFuzz library with Spacy PhraseMatcher The goal of this component is to find matches when there were NO \"perfect matches\" due to typos or abbreviations between a Spacy doc and a list of phrases.",
+            "github": "mjvallone/phruzz-matcher",
+            "pip": "phruzz_matcher",
+            "code_example": [
+                "import spacy",
+                "from spacy.language import Language",
+                "from phruzz_matcher.phrase_matcher import PhruzzMatcher",
+                "",
+                "famous_people = [",
+                "    \"Brad Pitt\",",
+                "    \"Demi Moore\",",
+                "    \"Bruce Willis\",",
+                "    \"Jim Carrey\",",
+                "]",
+                "",
+                "@Language.factory(\"phrase_matcher\")",
+                "def phrase_matcher(nlp: Language, name: str):",
+                "    return PhruzzMatcher(nlp, famous_people, \"FAMOUS_PEOPLE\", 85)",
+                "",
+                "nlp = spacy.blank('es')",
+                "nlp.add_pipe(\"phrase_matcher\")",
+                "",
+                "doc = nlp(\"El otro día fui a un bar donde vi a brad pit y a Demi Moore, estaban tomando unas cervezas mientras charlaban de sus asuntos.\")",
+                "print(f\"doc.ents: {doc.ents}\")",
+                "",
+                "#OUTPUT",
+                "#doc.ents: (brad pit, Demi Moore)"
+            ],
+            "thumb": "https://avatars.githubusercontent.com/u/961296?v=4",
+            "image": "",
+            "code_language": "python",
+            "author": "Martin Vallone",
+            "author_links": {
+                "github": "mjvallone",
+                "twitter": "vallotin",
+                "website": "https://fiqus.coop/"
+            },
+            "category": ["pipeline", "research", "standalone"],
+            "tags": ["spacy", "python", "nlp", "ner"]
+        }
     ],
 
     "categories": [

@@ -34,6 +34,7 @@ const MODEL_META = {
     core_sm: 'Vocabulary, syntax, entities',
     dep: 'Vocabulary, syntax',
     ent: 'Named entities',
+    sent: 'Sentence boundaries',
     pytt: 'PyTorch Transformers',
     trf: 'Transformers',
     vectors: 'Word vectors',
@@ -195,6 +196,7 @@ const Model = ({
     const [isError, setIsError] = useState(true)
     const [meta, setMeta] = useState({})
     const { type, genre, size } = getModelComponents(name)
+    const display_type = type === 'core' && size === 'sm' ? 'core_sm' : type
     const version = useMemo(() => getLatestVersion(name, compatibility, prereleases), [
         name,
         compatibility,
@@ -231,7 +233,7 @@ const Model = ({
 
     const rows = [
         { label: 'Language', tag: langId, content: langName },
-        { label: 'Type', tag: type, content: MODEL_META[type] },
+        { label: 'Type', tag: type, content: MODEL_META[display_type] },
         { label: 'Genre', tag: genre, content: MODEL_META[genre] },
        { label: 'Size', tag: size, content: meta.sizeFull },
        { label: 'Components', content: components, help: MODEL_META.components },