diff --git a/.github/workflows/autoblack.yml b/.github/workflows/autoblack.yml
index 70882c3cc..555322782 100644
--- a/.github/workflows/autoblack.yml
+++ b/.github/workflows/autoblack.yml
@@ -16,7 +16,7 @@ jobs:
with:
ref: ${{ github.head_ref }}
- uses: actions/setup-python@v4
- - run: pip install black
+ - run: pip install black -c requirements.txt
- name: Auto-format code if needed
run: black spacy
    # We can't run black --check here because that returns a non-zero exit
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 1f396bd71..f6f6dab59 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -173,6 +173,11 @@ formatting and [`flake8`](http://flake8.pycqa.org/en/latest/) for linting its
Python modules. If you've built spaCy from source, you'll already have both
tools installed.
+As a general rule of thumb, we use f-strings for any string formatting.
+The exception is calls to Python's `logging` functionality: to avoid
+unnecessary string conversions there, we use printf-style templates with
+`%s`, `%d` etc. and pass the values as separate arguments.
+
**⚠️ Note that formatting and linting is currently only possible for Python
modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.**
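
To make the new guideline concrete, here's a minimal sketch (an annotation, not part of the patch) contrasting the two styles; with a `%s` template, the string is only built if the log record is actually emitted:

```python
import logging

logger = logging.getLogger("spacy")
tables = ["lexeme_norm"]

# General code: f-strings.
header = f"Loading {len(tables)} lookup tables"

# Logging calls: printf-style templates with the values passed as
# arguments, so formatting is deferred until the level is enabled.
logger.debug("Loading lookups from spacy-lookups-data: %s", tables)
```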
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 541656c3d..dba11bd1a 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -41,7 +41,7 @@ jobs:
inputs:
versionSpec: "3.7"
- script: |
- pip install black==22.3.0
+ pip install black -c requirements.txt
python -m black spacy --check
displayName: "black"
- script: |
diff --git a/requirements.txt b/requirements.txt
index db7d3a5bd..fc3629376 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -31,10 +31,10 @@ pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0
flake8>=3.8.0,<6.0.0
hypothesis>=3.27.0,<7.0.0
-mypy>=0.990,<0.1000; platform_machine != "aarch64" and python_version >= "3.7"
+mypy>=0.990,<1.1.0; platform_machine != "aarch64" and python_version >= "3.7"
types-dataclasses>=0.1.3; python_version < "3.7"
types-mock>=0.1.1
types-setuptools>=57.0.0
types-requests
types-setuptools>=57.0.0
-black>=22.0,<23.0
+black==22.3.0
diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index ba3892b1d..f104feff9 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -90,9 +90,9 @@ def parse_config_overrides(
cli_overrides = _parse_overrides(args, is_cli=True)
if cli_overrides:
keys = [k for k in cli_overrides if k not in env_overrides]
- logger.debug(f"Config overrides from CLI: {keys}")
+ logger.debug("Config overrides from CLI: %s", keys)
if env_overrides:
- logger.debug(f"Config overrides from env variables: {list(env_overrides)}")
+ logger.debug("Config overrides from env variables: %s", list(env_overrides))
return {**cli_overrides, **env_overrides}
diff --git a/spacy/cli/package.py b/spacy/cli/package.py
index 324c5d1bb..6351f28eb 100644
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@@ -252,7 +252,7 @@ def get_third_party_dependencies(
raise regerr from None
module_name = func_info.get("module") # type: ignore[attr-defined]
if module_name: # the code is part of a module, not a --code file
- modules.add(func_info["module"].split(".")[0]) # type: ignore[index]
+ modules.add(func_info["module"].split(".")[0]) # type: ignore[union-attr]
dependencies = []
for module_name in modules:
if module_name in distributions:
diff --git a/spacy/cli/project/pull.py b/spacy/cli/project/pull.py
index 6e3cde88c..8894baa50 100644
--- a/spacy/cli/project/pull.py
+++ b/spacy/cli/project/pull.py
@@ -39,14 +39,17 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
# in the list.
while commands:
for i, cmd in enumerate(list(commands)):
- logger.debug(f"CMD: {cmd['name']}.")
+ logger.debug("CMD: %s.", cmd["name"])
deps = [project_dir / dep for dep in cmd.get("deps", [])]
if all(dep.exists() for dep in deps):
cmd_hash = get_command_hash("", "", deps, cmd["script"])
for output_path in cmd.get("outputs", []):
url = storage.pull(output_path, command_hash=cmd_hash)
logger.debug(
- f"URL: {url} for {output_path} with command hash {cmd_hash}"
+ "URL: %s for %s with command hash %s",
+ url,
+ output_path,
+ cmd_hash,
)
yield url, output_path
@@ -58,7 +61,7 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
commands.pop(i)
break
else:
- logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs.")
+ logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"])
else:
# If we didn't break the for loop, break the while loop.
break
diff --git a/spacy/cli/project/push.py b/spacy/cli/project/push.py
index bc779e9cd..a8178de21 100644
--- a/spacy/cli/project/push.py
+++ b/spacy/cli/project/push.py
@@ -37,15 +37,15 @@ def project_push(project_dir: Path, remote: str):
remote = config["remotes"][remote]
storage = RemoteStorage(project_dir, remote)
for cmd in config.get("commands", []):
- logger.debug(f"CMD: cmd['name']")
+ logger.debug("CMD: %s", cmd["name"])
deps = [project_dir / dep for dep in cmd.get("deps", [])]
if any(not dep.exists() for dep in deps):
- logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs")
+ logger.debug("Dependency missing. Skipping %s outputs", cmd["name"])
continue
cmd_hash = get_command_hash(
"", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
)
- logger.debug(f"CMD_HASH: {cmd_hash}")
+ logger.debug("CMD_HASH: %s", cmd_hash)
for output_path in cmd.get("outputs", []):
output_loc = project_dir / output_path
if output_loc.exists() and _is_not_empty_dir(output_loc):
@@ -55,7 +55,7 @@ def project_push(project_dir: Path, remote: str):
content_hash=get_content_hash(output_loc),
)
logger.debug(
- f"URL: {url} for output {output_path} with cmd_hash {cmd_hash}"
+ "URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash
)
yield output_path, url
diff --git a/spacy/errors.py b/spacy/errors.py
index ab8e69487..23d005369 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -444,8 +444,7 @@ class Errors(metaclass=ErrorsWithCodes):
E133 = ("The sum of prior probabilities for alias '{alias}' should not "
"exceed 1, but found {sum}.")
E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
- E139 = ("Knowledge base for component '{name}' is empty. Use the methods "
- "`kb.add_entity` and `kb.add_alias` to add entries.")
+ E139 = ("Knowledge base for component '{name}' is empty.")
E140 = ("The list of entities, prior probabilities and entity vectors "
"should be of equal length.")
E141 = ("Entity vectors should be of length {required} instead of the "
@@ -967,7 +966,7 @@ class Errors(metaclass=ErrorsWithCodes):
E1049 = ("No available port found for displaCy on host {host}. Please specify an available port "
"with `displacy.serve(doc, port=port)`")
E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
- "or use `auto_switch_port=True` to pick an available port automatically.")
+ "or use `auto_select_port=True` to pick an available port automatically.")
E1051 = ("'allow_overlap' can only be False when max_positive is 1, but found 'max_positive': {max_positive}.")
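
E1050's hint now names the real keyword argument, `auto_select_port`. A usage sketch (assuming a pipeline with a parser, e.g. `en_core_web_sm`, is installed):

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")  # assumption: this model is installed
doc = nlp("Autumn arrived early this year.")
# If the default port 5000 is taken, serve on the next free port
# instead of raising E1050.
displacy.serve(doc, style="dep", auto_select_port=True)
```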
diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx
index edba523cf..2a74d047b 100644
--- a/spacy/kb/kb_in_memory.pyx
+++ b/spacy/kb/kb_in_memory.pyx
@@ -46,6 +46,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
self._alias_index = PreshMap(nr_aliases + 1)
self._aliases_table = alias_vec(nr_aliases + 1)
+ def is_empty(self):
+ return len(self) == 0
+
def __len__(self):
return self.get_size_entities()
diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py
index 6963e8b79..28e5085a8 100644
--- a/spacy/lang/sv/__init__.py
+++ b/spacy/lang/sv/__init__.py
@@ -6,10 +6,7 @@ from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language, BaseDefaults
from ...pipeline import Lemmatizer
-
-
-# Punctuation stolen from Danish
-from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
class SwedishDefaults(BaseDefaults):
diff --git a/spacy/lang/sv/punctuation.py b/spacy/lang/sv/punctuation.py
new file mode 100644
index 000000000..67f1bcdc4
--- /dev/null
+++ b/spacy/lang/sv/punctuation.py
@@ -0,0 +1,33 @@
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..punctuation import TOKENIZER_SUFFIXES
+
+
+_quotes = CONCAT_QUOTES.replace("'", "")
+
+_infixes = (
+ LIST_ELLIPSES
+ + LIST_ICONS
+ + [
+ r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
+ r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
+ r"(?<=[{a}])[<>=](?=[{a}])".format(a=ALPHA),
+ r"(?<=[{a}]):(?=[{a}])".format(a=ALPHA_UPPER),
+ r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+ r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
+ r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
+ r"(?<=[{a}0-9])[<>=/](?=[{a}])".format(a=ALPHA),
+ r"(?<=[{a}0-9]):(?=[{a}])".format(a=ALPHA_UPPER),
+ ]
+)
+
+_suffixes = [
+ suffix
+ for suffix in TOKENIZER_SUFFIXES
+ if suffix not in ["'s", "'S", "’s", "’S", r"\'"]
+]
+_suffixes += [r"(?<=[^sSxXzZ])\'"]
+
+
+TOKENIZER_INFIXES = _infixes
+TOKENIZER_SUFFIXES = _suffixes
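
The practical effect, exercised by the new test further down, is that Swedish colon abbreviations are no longer split. A quick check (sketch, assuming a build that includes this patch):

```python
import spacy

nlp = spacy.blank("sv")
doc = nlp("EU:s nya regler gäller från den 99:e dagen")
print([t.text for t in doc])
# "EU:s" and "99:e" should each survive as a single token (issue #12311).
```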
diff --git a/spacy/language.py b/spacy/language.py
index e0abfd5e7..9fdcf6328 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -104,7 +104,7 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
@registry.misc("spacy.LookupsDataLoader.v1")
def load_lookups_data(lang, tables):
- util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
+ util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables)
lookups = load_lookups(lang=lang, tables=tables)
return lookups
@@ -1969,7 +1969,7 @@ class Language:
pipe = self.get_pipe(pipe_name)
pipe_cfg = self._pipe_configs[pipe_name]
if listeners:
- util.logger.debug(f"Replacing listeners of component '{pipe_name}'")
+ util.logger.debug("Replacing listeners of component '%s'", pipe_name)
if len(list(listeners)) != len(pipe_listeners):
# The number of listeners defined in the component model doesn't
# match the listeners to replace, so we won't be able to update
diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx
index 74c2d002f..adf96702b 100644
--- a/spacy/matcher/dependencymatcher.pyx
+++ b/spacy/matcher/dependencymatcher.pyx
@@ -82,8 +82,12 @@ cdef class DependencyMatcher:
"$-": self._imm_left_sib,
"$++": self._right_sib,
"$--": self._left_sib,
+ ">+": self._imm_right_child,
+ ">-": self._imm_left_child,
">++": self._right_child,
">--": self._left_child,
+ "<+": self._imm_right_parent,
+ "<-": self._imm_left_parent,
"<++": self._right_parent,
"<--": self._left_parent,
}
@@ -427,12 +431,34 @@ cdef class DependencyMatcher:
def _left_sib(self, doc, node):
return [doc[child.i] for child in doc[node].head.children if child.i < node]
+ def _imm_right_child(self, doc, node):
+ for child in doc[node].children:
+ if child.i == node + 1:
+ return [doc[child.i]]
+ return []
+
+ def _imm_left_child(self, doc, node):
+ for child in doc[node].children:
+ if child.i == node - 1:
+ return [doc[child.i]]
+ return []
+
def _right_child(self, doc, node):
return [doc[child.i] for child in doc[node].children if child.i > node]
def _left_child(self, doc, node):
return [doc[child.i] for child in doc[node].children if child.i < node]
+ def _imm_right_parent(self, doc, node):
+ if doc[node].head.i == node + 1:
+ return [doc[node].head]
+ return []
+
+ def _imm_left_parent(self, doc, node):
+ if doc[node].head.i == node - 1:
+ return [doc[node].head]
+ return []
+
def _right_parent(self, doc, node):
if doc[node].head.i > node:
return [doc[node].head]
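
A short usage sketch for one of the new operators (not part of the patch; assumes a pipeline with a dependency parser, e.g. `en_core_web_sm`). `>+` restricts the match to a child of the anchor token that is also its immediate right neighbor:

```python
import spacy
from spacy.matcher import DependencyMatcher

nlp = spacy.load("en_core_web_sm")  # assumption: parser available
matcher = DependencyMatcher(nlp.vocab)
pattern = [
    {"RIGHT_ID": "verb", "RIGHT_ATTRS": {"POS": "VERB"}},
    # ">+": a child of "verb" that sits immediately to its right
    {"LEFT_ID": "verb", "REL_OP": ">+", "RIGHT_ID": "imm_right_child",
     "RIGHT_ATTRS": {}},
]
matcher.add("IMM_RIGHT_CHILD", [pattern])
doc = nlp("The quick brown fox jumped over the lazy dog")
for match_id, token_ids in matcher(doc):
    print([doc[i].text for i in token_ids])  # expected: ['jumped', 'over']
```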
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index ea1b4b66b..b886bd2ec 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -828,6 +828,11 @@ def _get_attr_values(spec, string_store):
return attr_values
+def _predicate_cache_key(attr, predicate, value, *, regex=False, fuzzy=None):
+ # tuple order affects performance
+ return (attr, regex, fuzzy, predicate, srsly.json_dumps(value, sort_keys=True))
+
+
# These predicate helper classes are used to match the REGEX, IN, >= etc
# extensions to the matcher introduced in #3173.
@@ -847,7 +852,7 @@ class _FuzzyPredicate:
fuzz = self.predicate[len("FUZZY"):] # number after prefix
self.fuzzy = int(fuzz) if fuzz else -1
self.fuzzy_compare = fuzzy_compare
- self.key = (self.attr, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True))
+ self.key = _predicate_cache_key(self.attr, self.predicate, value, fuzzy=self.fuzzy)
def __call__(self, Token token):
if self.is_extension:
@@ -869,7 +874,7 @@ class _RegexPredicate:
self.value = re.compile(value)
self.predicate = predicate
self.is_extension = is_extension
- self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
+ self.key = _predicate_cache_key(self.attr, self.predicate, value)
if self.predicate not in self.operators:
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
@@ -905,7 +910,7 @@ class _SetPredicate:
self.value = set(get_string_id(v) for v in value)
self.predicate = predicate
self.is_extension = is_extension
- self.key = (self.attr, self.regex, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True))
+ self.key = _predicate_cache_key(self.attr, self.predicate, value, regex=self.regex, fuzzy=self.fuzzy)
if self.predicate not in self.operators:
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
@@ -977,7 +982,7 @@ class _ComparisonPredicate:
self.value = value
self.predicate = predicate
self.is_extension = is_extension
- self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
+ self.key = _predicate_cache_key(self.attr, self.predicate, value)
if self.predicate not in self.operators:
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
@@ -1092,7 +1097,7 @@ def _get_extension_extra_predicates(spec, extra_predicates, predicate_types,
if isinstance(value, dict):
for type_, cls in predicate_types.items():
if type_ in value:
- key = (attr, type_, srsly.json_dumps(value[type_], sort_keys=True))
+ key = _predicate_cache_key(attr, type_, value[type_])
if key in seen_predicates:
output.append(seen_predicates[key])
else:
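
The helper consolidates the cache-key layout that each predicate class previously built by hand. As I read the surrounding code (sketch, not part of the patch), identical predicate specs hash to the same key, so `seen_predicates` can reuse one compiled predicate across rules:

```python
from spacy.lang.en import English
from spacy.matcher import Matcher

nlp = English()
matcher = Matcher(nlp.vocab)
colors = {"LOWER": {"IN": ["red", "green", "blue"]}}
# Both rules yield the same (attr, regex, fuzzy, predicate, value) key,
# so the set predicate should only be compiled once and shared.
matcher.add("COLOR", [[colors]])
matcher.add("COLOR_CAR", [[colors, {"LOWER": "car"}]])
doc = nlp("a red bike and a blue car")
print([doc[start:end].text for _, start, end in matcher(doc)])
```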
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 62845287b..a11964117 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -250,7 +250,7 @@ class EntityLinker(TrainablePipe):
# Raise an error if the knowledge base is not initialized.
if self.kb is None:
raise ValueError(Errors.E1018.format(name=self.name))
- if len(self.kb) == 0:
+ if hasattr(self.kb, "is_empty") and self.kb.is_empty():
raise ValueError(Errors.E139.format(name=self.name))
def initialize(
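
The `hasattr` guard keeps `validate_kb` compatible with custom `KnowledgeBase` implementations that predate `is_empty`. For the built-in KB, the new method behaves as below (sketch):

```python
from spacy.kb import InMemoryLookupKB
from spacy.vocab import Vocab

kb = InMemoryLookupKB(Vocab(), entity_vector_length=3)
assert kb.is_empty()  # freshly constructed: no entities yet
kb.add_entity(entity="Q42", freq=12, entity_vector=[1.0, 2.0, 3.0])
kb.add_alias(alias="Douglas", entities=["Q42"], probabilities=[0.9])
assert not kb.is_empty()
```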
diff --git a/spacy/tests/lang/sv/test_prefix_suffix_infix.py b/spacy/tests/lang/sv/test_prefix_suffix_infix.py
index bbb0ff415..0aa495992 100644
--- a/spacy/tests/lang/sv/test_prefix_suffix_infix.py
+++ b/spacy/tests/lang/sv/test_prefix_suffix_infix.py
@@ -32,3 +32,10 @@ def test_tokenizer_splits_comma_infix(sv_tokenizer, text):
def test_tokenizer_splits_ellipsis_infix(sv_tokenizer, text):
tokens = sv_tokenizer(text)
assert len(tokens) == 3
+
+
+@pytest.mark.issue(12311)
+@pytest.mark.parametrize("text", ["99:e", "c:a", "EU:s", "Maj:t"])
+def test_sv_tokenizer_handles_colon(sv_tokenizer, text):
+ tokens = sv_tokenizer(text)
+ assert len(tokens) == 1
diff --git a/spacy/tests/matcher/test_dependency_matcher.py b/spacy/tests/matcher/test_dependency_matcher.py
index b4e19d69d..200384320 100644
--- a/spacy/tests/matcher/test_dependency_matcher.py
+++ b/spacy/tests/matcher/test_dependency_matcher.py
@@ -316,16 +316,32 @@ def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches):
("the", "brown", "$--", 0),
("brown", "the", "$--", 1),
("brown", "brown", "$--", 0),
+ ("over", "jumped", "<+", 0),
+ ("quick", "fox", "<+", 0),
+ ("the", "quick", "<+", 0),
+ ("brown", "fox", "<+", 1),
("quick", "fox", "<++", 1),
("quick", "over", "<++", 0),
("over", "jumped", "<++", 0),
("the", "fox", "<++", 2),
+ ("brown", "fox", "<-", 0),
+ ("fox", "over", "<-", 0),
+ ("the", "over", "<-", 0),
+ ("over", "jumped", "<-", 1),
("brown", "fox", "<--", 0),
("fox", "jumped", "<--", 0),
("fox", "over", "<--", 1),
+ ("fox", "brown", ">+", 0),
+ ("over", "fox", ">+", 0),
+ ("over", "the", ">+", 0),
+ ("jumped", "over", ">+", 1),
("jumped", "over", ">++", 1),
("fox", "lazy", ">++", 0),
("over", "the", ">++", 0),
+ ("jumped", "over", ">-", 0),
+ ("fox", "quick", ">-", 0),
+ ("brown", "quick", ">-", 0),
+ ("fox", "brown", ">-", 1),
("brown", "fox", ">--", 0),
("fox", "brown", ">--", 1),
("jumped", "fox", ">--", 1),
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 99f164f15..2a6258386 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -353,6 +353,9 @@ def test_kb_default(nlp):
"""Test that the default (empty) KB is loaded upon construction"""
entity_linker = nlp.add_pipe("entity_linker", config={})
assert len(entity_linker.kb) == 0
+ with pytest.raises(ValueError, match="E139"):
+ # this raises an error because the KB is empty
+ entity_linker.validate_kb()
assert entity_linker.kb.get_size_entities() == 0
assert entity_linker.kb.get_size_aliases() == 0
# 64 is the default value from pipeline.entity_linker
diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py
index 40100412a..8aaadf686 100644
--- a/spacy/tests/test_cli_app.py
+++ b/spacy/tests/test_cli_app.py
@@ -1,5 +1,7 @@
import os
from pathlib import Path
+import pytest
+import srsly
from typer.testing import CliRunner
from spacy.tokens import DocBin, Doc
@@ -89,3 +91,138 @@ def test_debug_data_trainable_lemmatizer_cli(en_vocab):
# Instead of checking specific wording of the output, which may change,
# we'll check that this section of the debug output is present.
assert "= Trainable Lemmatizer =" in result_debug_data.stdout
+
+
+# project tests
+
+SAMPLE_PROJECT = {
+ "title": "Sample project",
+ "description": "This is a project for testing",
+ "assets": [
+ {
+ "dest": "assets/spacy-readme.md",
+ "url": "https://github.com/explosion/spaCy/raw/dec81508d28b47f09a06203c472b37f00db6c869/README.md",
+ "checksum": "411b2c89ccf34288fae8ed126bf652f7",
+ },
+ {
+ "dest": "assets/citation.cff",
+ "url": "https://github.com/explosion/spaCy/raw/master/CITATION.cff",
+ "checksum": "c996bfd80202d480eb2e592369714e5e",
+ "extra": True,
+ },
+ ],
+ "commands": [
+ {
+ "name": "ok",
+ "help": "print ok",
+ "script": ["python -c \"print('okokok')\""],
+ },
+ {
+ "name": "create",
+ "help": "make a file",
+ "script": ["touch abc.txt"],
+ "outputs": ["abc.txt"],
+ },
+ {
+ "name": "clean",
+ "help": "remove test file",
+ "script": ["rm abc.txt"],
+ },
+ ],
+}
+
+SAMPLE_PROJECT_TEXT = srsly.yaml_dumps(SAMPLE_PROJECT)
+
+
+@pytest.fixture
+def project_dir():
+ with make_tempdir() as pdir:
+ (pdir / "project.yml").write_text(SAMPLE_PROJECT_TEXT)
+ yield pdir
+
+
+def test_project_document(project_dir):
+ readme_path = project_dir / "README.md"
+ assert not readme_path.exists(), "README already exists"
+ result = CliRunner().invoke(
+ app, ["project", "document", str(project_dir), "-o", str(readme_path)]
+ )
+ assert result.exit_code == 0
+ assert readme_path.is_file()
+ text = readme_path.read_text("utf-8")
+ assert SAMPLE_PROJECT["description"] in text
+
+
+def test_project_assets(project_dir):
+ asset_dir = project_dir / "assets"
+ assert not asset_dir.exists(), "Assets dir is already present"
+ result = CliRunner().invoke(app, ["project", "assets", str(project_dir)])
+ assert result.exit_code == 0
+ assert (asset_dir / "spacy-readme.md").is_file(), "Assets not downloaded"
+ # check that extras work
+ result = CliRunner().invoke(app, ["project", "assets", "--extra", str(project_dir)])
+ assert result.exit_code == 0
+ assert (asset_dir / "citation.cff").is_file(), "Extras not downloaded"
+
+
+def test_project_run(project_dir):
+ # make sure dry run works
+ test_file = project_dir / "abc.txt"
+ result = CliRunner().invoke(
+ app, ["project", "run", "--dry", "create", str(project_dir)]
+ )
+ assert result.exit_code == 0
+ assert not test_file.is_file()
+ result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)])
+ assert result.exit_code == 0
+ assert test_file.is_file()
+ result = CliRunner().invoke(app, ["project", "run", "ok", str(project_dir)])
+ assert result.exit_code == 0
+ assert "okokok" in result.stdout
+
+
+@pytest.mark.parametrize(
+ "options",
+ [
+ "",
+ # "--sparse",
+ "--branch v3",
+ "--repo https://github.com/explosion/projects --branch v3",
+ ],
+)
+def test_project_clone(options):
+ with make_tempdir() as workspace:
+ out = workspace / "project"
+ target = "benchmarks/ner_conll03"
+ if not options:
+ options = []
+ else:
+ options = options.split()
+ result = CliRunner().invoke(
+ app, ["project", "clone", target, *options, str(out)]
+ )
+ assert result.exit_code == 0
+ assert (out / "README.md").is_file()
+
+
+def test_project_push_pull(project_dir):
+ proj = dict(SAMPLE_PROJECT)
+ remote = "xyz"
+
+ with make_tempdir() as remote_dir:
+ proj["remotes"] = {remote: str(remote_dir)}
+ proj_text = srsly.yaml_dumps(proj)
+ (project_dir / "project.yml").write_text(proj_text)
+
+ test_file = project_dir / "abc.txt"
+ result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)])
+ assert result.exit_code == 0
+ assert test_file.is_file()
+ result = CliRunner().invoke(app, ["project", "push", remote, str(project_dir)])
+ assert result.exit_code == 0
+ result = CliRunner().invoke(app, ["project", "run", "clean", str(project_dir)])
+ assert result.exit_code == 0
+ assert not test_file.exists()
+ result = CliRunner().invoke(app, ["project", "pull", remote, str(project_dir)])
+ assert result.exit_code == 0
+ assert test_file.is_file()
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index 03790eb86..236856dad 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -46,7 +46,7 @@ def assert_sents_error(doc):
def warn_error(proc_name, proc, docs, e):
logger = logging.getLogger("spacy")
- logger.warning(f"Trouble with component {proc_name}.")
+ logger.warning("Trouble with component %s.", proc_name)
@pytest.fixture
diff --git a/spacy/training/callbacks.py b/spacy/training/callbacks.py
index 426fddf90..7e2494f5b 100644
--- a/spacy/training/callbacks.py
+++ b/spacy/training/callbacks.py
@@ -11,7 +11,7 @@ def create_copy_from_base_model(
) -> Callable[[Language], Language]:
def copy_from_base_model(nlp):
if tokenizer:
- logger.info(f"Copying tokenizer from: {tokenizer}")
+ logger.info("Copying tokenizer from: %s", tokenizer)
base_nlp = load_model(tokenizer)
if nlp.config["nlp"]["tokenizer"] == base_nlp.config["nlp"]["tokenizer"]:
nlp.tokenizer.from_bytes(base_nlp.tokenizer.to_bytes(exclude=["vocab"]))
@@ -23,7 +23,7 @@ def create_copy_from_base_model(
)
)
if vocab:
- logger.info(f"Copying vocab from: {vocab}")
+ logger.info("Copying vocab from: %s", vocab)
# only reload if the vocab is from a different model
if tokenizer != vocab:
base_nlp = load_model(vocab)
diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py
index d626ad0e0..086ad831c 100644
--- a/spacy/training/corpus.py
+++ b/spacy/training/corpus.py
@@ -29,7 +29,7 @@ def create_docbin_reader(
) -> Callable[["Language"], Iterable[Example]]:
if path is None:
raise ValueError(Errors.E913)
- util.logger.debug(f"Loading corpus from path: {path}")
+ util.logger.debug("Loading corpus from path: %s", path)
return Corpus(
path,
gold_preproc=gold_preproc,
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 6304e4a84..e90617852 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -62,10 +62,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
frozen_components = T["frozen_components"]
# Sourced components that require resume_training
resume_components = [p for p in sourced if p not in frozen_components]
- logger.info(f"Pipeline: {nlp.pipe_names}")
+ logger.info("Pipeline: %s", nlp.pipe_names)
if resume_components:
with nlp.select_pipes(enable=resume_components):
- logger.info(f"Resuming training for: {resume_components}")
+ logger.info("Resuming training for: %s", resume_components)
nlp.resume_training(sgd=optimizer)
# Make sure that listeners are defined before initializing further
nlp._link_components()
@@ -73,16 +73,17 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
if T["max_epochs"] == -1:
sample_size = 100
logger.debug(
- f"Due to streamed train corpus, using only first {sample_size} "
- f"examples for initialization. If necessary, provide all labels "
- f"in [initialize]. More info: https://spacy.io/api/cli#init_labels"
+ "Due to streamed train corpus, using only first %s examples for initialization. "
+ "If necessary, provide all labels in [initialize]. "
+ "More info: https://spacy.io/api/cli#init_labels",
+ sample_size,
)
nlp.initialize(
lambda: islice(train_corpus(nlp), sample_size), sgd=optimizer
)
else:
nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
- logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
+ logger.info("Initialized pipeline components: %s", nlp.pipe_names)
# Detect components with listeners that are not frozen consistently
for name, proc in nlp.pipeline:
for listener in getattr(
@@ -109,7 +110,7 @@ def init_vocab(
) -> None:
if lookups:
nlp.vocab.lookups = lookups
- logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}")
+ logger.info("Added vocab lookups: %s", ", ".join(lookups.tables))
data_path = ensure_path(data)
if data_path is not None:
lex_attrs = srsly.read_jsonl(data_path)
@@ -125,11 +126,11 @@ def init_vocab(
else:
oov_prob = DEFAULT_OOV_PROB
nlp.vocab.cfg.update({"oov_prob": oov_prob})
- logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab")
+ logger.info("Added %d lexical entries to the vocab", len(nlp.vocab))
logger.info("Created vocabulary")
if vectors is not None:
load_vectors_into_model(nlp, vectors)
- logger.info(f"Added vectors: {vectors}")
+ logger.info("Added vectors: %s", vectors)
# warn if source model vectors are not identical
sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {})
vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
@@ -191,7 +192,7 @@ def init_tok2vec(
if weights_data is not None:
layer = get_tok2vec_ref(nlp, P)
layer.from_bytes(weights_data)
- logger.info(f"Loaded pretrained weights from {init_tok2vec}")
+ logger.info("Loaded pretrained weights from %s", init_tok2vec)
return True
return False
@@ -216,13 +217,13 @@ def convert_vectors(
nlp.vocab.deduplicate_vectors()
else:
if vectors_loc:
- logger.info(f"Reading vectors from {vectors_loc}")
+ logger.info("Reading vectors from %s", vectors_loc)
vectors_data, vector_keys, floret_settings = read_vectors(
vectors_loc,
truncate,
mode=mode,
)
- logger.info(f"Loaded vectors from {vectors_loc}")
+ logger.info("Loaded vectors from %s", vectors_loc)
else:
vectors_data, vector_keys = (None, None)
if vector_keys is not None and mode != VectorsMode.floret:
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 885257772..eca40e3d9 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -370,6 +370,6 @@ def clean_output_dir(path: Optional[Path]) -> None:
if subdir.exists():
try:
shutil.rmtree(str(subdir))
- logger.debug(f"Removed existing output directory: {subdir}")
+ logger.debug("Removed existing output directory: %s", subdir)
except Exception as e:
raise IOError(Errors.E901.format(path=path)) from e
diff --git a/spacy/util.py b/spacy/util.py
index 8bf8fb1b0..8cc89217d 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -32,6 +32,7 @@ import inspect
import pkgutil
import logging
import socket
+import stat
try:
import cupy.random
@@ -60,7 +61,7 @@ if TYPE_CHECKING:
# fmt: off
OOV_RANK = numpy.iinfo(numpy.uint64).max
DEFAULT_OOV_PROB = -20
-LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]
+LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "grc", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]
# Default order of sections in the config file. Not all sections need to exist,
# and additional sections are added at the end, in alphabetical order.
@@ -144,8 +145,17 @@ class registry(thinc.registry):
return func
@classmethod
- def find(cls, registry_name: str, func_name: str) -> Callable:
- """Get info about a registered function from the registry."""
+ def find(
+ cls, registry_name: str, func_name: str
+ ) -> Dict[str, Optional[Union[str, int]]]:
+ """Find information about a registered function, including the
+ module and path to the file it's defined in, the line number and the
+ docstring, if available.
+
+ registry_name (str): Name of the catalogue registry.
+ func_name (str): Name of the registered function.
+ RETURNS (Dict[str, Optional[Union[str, int]]]): The function info.
+ """
# We're overwriting this classmethod so we're able to provide more
# specific error messages and implement a fallback to spacy-legacy.
if not hasattr(cls, registry_name):
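
A lookup sketch for the documented return value; the `module` key is the one `spacy/cli/package.py` relies on above, and the remaining keys are those listed in the new docstring:

```python
from spacy.util import registry

info = registry.find("misc", "spacy.LookupsDataLoader.v1")
print(info.get("module"))  # e.g. "spacy.language", where the loader lives
```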
@@ -1041,8 +1051,15 @@ def make_tempdir() -> Generator[Path, None, None]:
"""
d = Path(tempfile.mkdtemp())
yield d
+
+ # On Windows, git clones use read-only files, which cause permission errors
+ # when being deleted. This forcibly fixes permissions.
+ def force_remove(rmfunc, path, ex):
+ os.chmod(path, stat.S_IWRITE)
+ rmfunc(path)
+
try:
- shutil.rmtree(str(d))
+ shutil.rmtree(str(d), onerror=force_remove)
except PermissionError as e:
warnings.warn(Warnings.W091.format(dir=d, msg=e))
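
With the `onerror` hook, cleanup also succeeds for read-only files such as those in git clones on Windows, which the new `test_project_clone` test exercises. Typical usage is unchanged (sketch):

```python
from spacy.util import make_tempdir

with make_tempdir() as tmp_dir:
    (tmp_dir / "scratch.txt").write_text("temporary data")
# On exit the directory is removed; force_remove clears the read-only
# flag on any path that would otherwise raise a PermissionError.
```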
diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx
index 2a1bc4380..966b5830a 100644
--- a/website/docs/api/architectures.mdx
+++ b/website/docs/api/architectures.mdx
@@ -924,6 +924,15 @@ plausible [`Candidate`](/api/kb/#candidate) objects. The default
`CandidateGenerator` uses the text of a mention to find its potential aliases in
the `KnowledgeBase`. Note that this function is case-dependent.
+### spacy.CandidateBatchGenerator.v1 {id="CandidateBatchGenerator"}
+
+A function that takes as input a [`KnowledgeBase`](/api/kb) and an `Iterable` of
+[`Span`](/api/span) objects denoting named entities, and returns a list of
+plausible [`Candidate`](/api/kb/#candidate) objects per specified
+[`Span`](/api/span). The default `CandidateBatchGenerator` uses the text of a
+mention to find its potential aliases in the `KnowledgeBase`. Note that this
+function is case-dependent.
+
## Coreference {id="coref-architectures",tag="experimental"}
A [`CoreferenceResolver`](/api/coref) component identifies tokens that refer to
diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index bd966015e..3f31bef95 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -1492,7 +1492,7 @@ $ python -m spacy project push [remote] [project_dir]
### project pull {id="project-pull",tag="command"}
Download all files or directories listed as `outputs` for commands, unless they
-are not already present locally. When searching for files in the remote, `pull`
+are already present locally. When searching for files in the remote, `pull`
won't just look at the output path, but will also consider the **command
string** and the **hashes of the dependencies**. For instance, let's say you've
previously pushed a checkpoint to the remote, but now you've changed some
diff --git a/website/docs/api/dependencymatcher.mdx b/website/docs/api/dependencymatcher.mdx
index 390034a6c..14e0916d1 100644
--- a/website/docs/api/dependencymatcher.mdx
+++ b/website/docs/api/dependencymatcher.mdx
@@ -68,24 +68,28 @@ The following operators are supported by the `DependencyMatcher`, most of which
come directly from
[Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html):
-| Symbol | Description |
-| --------- | -------------------------------------------------------------------------------------------------------------------- |
-| `A < B` | `A` is the immediate dependent of `B`. |
-| `A > B` | `A` is the immediate head of `B`. |
-| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. |
-| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. |
-| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
-| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
-| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
-| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
-| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
-| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
-| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
-| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
-| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
-| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
-| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
-| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
+| Symbol | Description |
+| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- |
+| `A < B` | `A` is the immediate dependent of `B`. |
+| `A > B` | `A` is the immediate head of `B`. |
+| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. |
+| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. |
+| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
+| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
+| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
+| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
+| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
+| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
+| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
+| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
+| `A >+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_.           |
+| `A >- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_.            |
+| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
+| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
+| `A <+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_.           |
+| `A <- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_.            |
+| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
+| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
## DependencyMatcher.\_\_init\_\_ {id="init",tag="method"}
diff --git a/website/docs/api/doc.mdx b/website/docs/api/doc.mdx
index 13c59c4af..0a5826500 100644
--- a/website/docs/api/doc.mdx
+++ b/website/docs/api/doc.mdx
@@ -37,7 +37,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
| `words` | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str,int]]]~~ |
| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
| _keyword-only_ | |
-| `user\_data` | Optional extra data to attach to the Doc. ~~Dict~~ |
+| `user_data` | Optional extra data to attach to the Doc. ~~Dict~~ |
| `tags` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
| `pos` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
| `morphs` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx
index 9748719d7..d0851a59f 100644
--- a/website/docs/api/top-level.mdx
+++ b/website/docs/api/top-level.mdx
@@ -354,22 +354,22 @@ If a setting is not present in the options, the default value will be used.
> displacy.serve(doc, style="dep", options=options)
> ```
-| Name | Description |
-| ------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- |
-| `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ |
-| `add_lemma` | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ |
-| `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ |
-| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ |
-| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
-| `color` | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ |
-| `bg` | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ |
-| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
-| `offset_x` | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ |
-| `arrow_stroke` | Width of arrow path in px. Defaults to `2`. ~~int~~ |
-| `arrow_width` | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~ |
-| `arrow_spacing` | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~ |
-| `word_spacing` | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~ |
-| `distance` | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~ |
+| Name | Description |
+| ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ |
+| `add_lemma` | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ |
+| `collapse_punct`   | Attach punctuation to tokens. Can make the parse more readable, as it avoids long arcs just to attach punctuation. Defaults to `True`. ~~bool~~ |
+| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ |
+| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
+| `color`            | Text color. Can be provided as a string in any legal CSS color format; e.g. `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all specify the color green (without transparency). Defaults to `"#000000"`. ~~str~~ |
+| `bg`               | Background color. Accepts the same CSS color formats as `color`. Defaults to `"#ffffff"`. ~~str~~ |
+| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
+| `offset_x` | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ |
+| `arrow_stroke` | Width of arrow path in px. Defaults to `2`. ~~int~~ |
+| `arrow_width` | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~ |
+| `arrow_spacing` | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~ |
+| `word_spacing` | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~ |
+| `distance` | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~ |
#### Named Entity Visualizer options {id="displacy_options-ent"}
diff --git a/website/docs/models/index.mdx b/website/docs/models/index.mdx
index 371e4460f..366d44f0e 100644
--- a/website/docs/models/index.mdx
+++ b/website/docs/models/index.mdx
@@ -21,8 +21,8 @@ menu:
## Package naming conventions {id="conventions"}
In general, spaCy expects all pipeline packages to follow the naming convention
-of `[lang]\_[name]`. For spaCy's pipelines, we also chose to divide the name
-into three components:
+of `[lang]_[name]`. For spaCy's pipelines, we also chose to divide the name into
+three components:
1. **Type:** Capabilities (e.g. `core` for general-purpose pipeline with
tagging, parsing, lemmatization and named entity recognition, or `dep` for
diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx
index 08d2b3b91..55c043015 100644
--- a/website/docs/usage/rule-based-matching.mdx
+++ b/website/docs/usage/rule-based-matching.mdx
@@ -1096,20 +1096,28 @@ The following operators are supported by the `DependencyMatcher`, most of which
come directly from
[Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html):
-| Symbol | Description |
-| --------- | -------------------------------------------------------------------------------------------------------------------- |
-| `A < B` | `A` is the immediate dependent of `B`. |
-| `A > B` | `A` is the immediate head of `B`. |
-| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. |
-| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. |
-| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
-| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
-| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
-| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
-| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
-| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
-| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
-| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
+| Symbol | Description |
+| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- |
+| `A < B` | `A` is the immediate dependent of `B`. |
+| `A > B` | `A` is the immediate head of `B`. |
+| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. |
+| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. |
+| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
+| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
+| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
+| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
+| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
+| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
+| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
+| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
+| `A >+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_.           |
+| `A >- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_.            |
+| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
+| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
+| `A <+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_.           |
+| `A <- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_.            |
+| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
+| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
### Designing dependency matcher patterns {id="dependencymatcher-patterns"}
@@ -1442,8 +1450,8 @@ nlp.to_disk("/path/to/pipeline")
The saved pipeline now includes the `"entity_ruler"` in its
[`config.cfg`](/api/data-formats#config) and the pipeline directory contains a
-file `entityruler.jsonl` with the patterns. When you load the pipeline back in,
-all pipeline components will be restored and deserialized – including the entity
+file `patterns.jsonl` with the patterns. When you load the pipeline back in, all
+pipeline components will be restored and deserialized – including the entity
ruler. This lets you ship powerful pipeline packages with binary weights _and_
rules included!
diff --git a/website/docs/usage/visualizers.mdx b/website/docs/usage/visualizers.mdx
index 1d3682af4..c372744de 100644
--- a/website/docs/usage/visualizers.mdx
+++ b/website/docs/usage/visualizers.mdx
@@ -58,12 +58,12 @@ arcs.
-| Argument | Description |
-| --------- | ----------------------------------------------------------------------------------------- |
-| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
-| `color` | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ |
-| `bg` | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ |
-| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
+| Argument | Description |
+| --------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
+| `color`   | Text color. Can be provided as a string in any legal CSS color format; e.g. `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all specify the color green (without transparency). Defaults to `"#000000"`. ~~str~~ |
+| `bg`      | Background color. Accepts the same CSS color formats as `color`. Defaults to `"#ffffff"`. ~~str~~ |
+| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
For a list of all available options, see the
[`displacy` API documentation](/api/top-level#displacy_options).