Mirror of https://github.com/explosion/spaCy.git (synced 2025-07-30 10:00:04 +03:00)

commit 5a08596f92: Merge branch 'master' into feature/etl
.github/azure-steps.yml (vendored, 44 changed lines)

@@ -52,17 +52,17 @@ steps:
      python -W error -c "import spacy"
    displayName: "Test import"

-  # - script: |
-  #     python -m spacy download ca_core_news_sm
-  #     python -m spacy download ca_core_news_md
-  #     python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
-  #   displayName: 'Test download CLI'
-  #   condition: eq(variables['python_version'], '3.8')
-  #
-  # - script: |
-  #     python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
-  #   displayName: 'Test no warnings on load (#11713)'
-  #   condition: eq(variables['python_version'], '3.8')
+  - script: |
+      python -m spacy download ca_core_news_sm
+      python -m spacy download ca_core_news_md
+      python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+    displayName: 'Test download CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+    displayName: 'Test no warnings on load (#11713)'
+    condition: eq(variables['python_version'], '3.8')

   - script: |
       python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .

@@ -86,17 +86,17 @@ steps:
    displayName: 'Test train CLI'
    condition: eq(variables['python_version'], '3.8')

-  # - script: |
-  #     python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-  #     PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
-  #   displayName: 'Test assemble CLI'
-  #   condition: eq(variables['python_version'], '3.8')
-  #
-  # - script: |
-  #     python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
-  #     python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
-  #   displayName: 'Test assemble CLI vectors warning'
-  #   condition: eq(variables['python_version'], '3.8')
+  - script: |
+      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+    displayName: 'Test assemble CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+      python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+    displayName: 'Test assemble CLI vectors warning'
+    condition: eq(variables['python_version'], '3.8')

   - script: |
       python -m pip install -U -r requirements.txt
.gitignore (vendored, 10 changed lines)

@@ -10,16 +10,6 @@ spacy/tests/package/setup.cfg
 spacy/tests/package/pyproject.toml
 spacy/tests/package/requirements.txt

-# Website
-website/.cache/
-website/public/
-website/node_modules
-website/.npm
-website/logs
-*.log
-npm-debug.log*
-quickstart-training-generator.js
-
 # Cython / C extensions
 cythonize.json
 spacy/*.html
@@ -16,7 +16,7 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
 model packaging, deployment and workflow management. spaCy is commercial
 open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).

-💫 **Version 3.4 out now!**
+💫 **Version 3.5 out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)

 [](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
@@ -22,7 +22,7 @@ langcodes>=3.2.0,<4.0.0
 # Official Python utilities
 setuptools
 packaging>=20.0
-typing_extensions>=3.7.4.1,<4.2.0; python_version < "3.8"
+typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
 # Development dependencies
 pre-commit>=2.13.0
 cython>=0.25,<3.0
@@ -63,7 +63,7 @@ install_requires =
     # Official Python utilities
     setuptools
     packaging>=20.0
-    typing_extensions>=3.7.4,<4.2.0; python_version < "3.8"
+    typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
     langcodes>=3.2.0,<4.0.0

 [options.entry_points]
@@ -106,9 +106,7 @@ def serve(

     if is_in_jupyter():
         warnings.warn(Warnings.W011)
-    render(
-        docs, style=style, page=page, minify=minify, options=options, manual=manual
-    )
+    render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
     httpd = simple_server.make_server(host, port, app)
     print(f"\nUsing the '{style}' visualizer")
     print(f"Serving on http://{host}:{port} ...\n")
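Note: the hunk above only reflows the `render(...)` call inside `displacy.serve`; behavior is unchanged. For context, a minimal sketch of `serve` usage (assumes `en_core_web_sm` is installed):

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("This is a sentence about Berlin.")
# Outside Jupyter this starts a local web server; inside Jupyter it warns
# (W011, as in the code above) and falls back to rendering inline.
displacy.serve(doc, style="dep")
```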
@@ -25,7 +25,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
     """An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases,
     to support entity linking of named entities to real-world concepts.

-    DOCS: https://spacy.io/api/kb_in_memory
+    DOCS: https://spacy.io/api/inmemorylookupkb
     """

     def __init__(self, Vocab vocab, entity_vector_length):
@@ -22,7 +22,7 @@ cpdef bint levenshtein_compare(input_text: str, pattern_text: str, fuzzy: int =
         max_edits = fuzzy
     else:
         # allow at least two edits (to allow at least one transposition) and up
-        # to 20% of the pattern string length
+        # to 30% of the pattern string length
         max_edits = max(2, round(0.3 * len(pattern_text)))
     return levenshtein(input_text, pattern_text, max_edits) <= max_edits
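Note: this hunk fixes only the comment; the code already budgeted 30% of the pattern length. A standalone sketch of the budget rule (the helper name is hypothetical, the formula is copied from the line above):

```python
# At least 2 edits (so one transposition is always possible), otherwise
# 30% of the pattern string length, rounded, per levenshtein_compare above.
def default_max_edits(pattern_text: str) -> int:
    return max(2, round(0.3 * len(pattern_text)))

print(default_max_edits("the"))         # 2
print(default_max_edits("definitely"))  # 3 (30% of 10 characters)
```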
@@ -5,8 +5,12 @@ from ..vocab import Vocab
 from ..tokens import Doc, Span

 class Matcher:
-    def __init__(self, vocab: Vocab, validate: bool = ...,
-                 fuzzy_compare: Callable[[str, str, int], bool] = ...) -> None: ...
+    def __init__(
+        self,
+        vocab: Vocab,
+        validate: bool = ...,
+        fuzzy_compare: Callable[[str, str, int], bool] = ...,
+    ) -> None: ...
     def __reduce__(self) -> Any: ...
     def __len__(self) -> int: ...
     def __contains__(self, key: str) -> bool: ...
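Note: the widened stub above documents the `fuzzy_compare` hook on `Matcher`. A sketch of supplying a custom comparator with the `Callable[[str, str, int], bool]` signature; the tightened budget is illustrative, and the import path assumes the `levenshtein` helper lives in `spacy.matcher.levenshtein` alongside `levenshtein_compare` from the previous hunk:

```python
import spacy
from spacy.matcher import Matcher
from spacy.matcher.levenshtein import levenshtein  # assumed import path

def strict_compare(input_text: str, pattern_text: str, fuzzy: int) -> bool:
    # fuzzy >= 0 when FUZZY1..FUZZY9 set an explicit budget; otherwise cap
    # the default budget at 1 edit instead of the 30%-of-length rule.
    max_edits = fuzzy if fuzzy >= 0 else 1
    return levenshtein(input_text, pattern_text, max_edits) <= max_edits

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab, fuzzy_compare=strict_compare)
matcher.add("GREETING", [[{"LOWER": {"FUZZY": "hello"}}]])
print(matcher(nlp("helo there")))  # "helo" is 1 edit away -> still matches
```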
@@ -5,8 +5,8 @@ from itertools import islice
 import numpy as np

 import srsly
-from thinc.api import Config, Model, SequenceCategoricalCrossentropy
-from thinc.types import Floats2d, Ints1d, Ints2d
+from thinc.api import Config, Model, SequenceCategoricalCrossentropy, NumpyOps
+from thinc.types import Floats2d, Ints2d

 from ._edit_tree_internals.edit_trees import EditTrees
 from ._edit_tree_internals.schemas import validate_edit_tree

@@ -20,6 +20,10 @@ from ..vocab import Vocab
 from .. import util


+# The cutoff value of *top_k* above which an alternative method is used to process guesses.
+TOP_K_GUARDRAIL = 20
+
+
 default_model_config = """
 [model]
 @architectures = "spacy.Tagger.v2"

@@ -115,6 +119,7 @@ class EditTreeLemmatizer(TrainablePipe):

         self.cfg: Dict[str, Any] = {"labels": []}
         self.scorer = scorer
+        self.numpy_ops = NumpyOps()

     def get_loss(
         self, examples: Iterable[Example], scores: List[Floats2d]

@@ -144,6 +149,18 @@ class EditTreeLemmatizer(TrainablePipe):
         return float(loss), d_scores

     def predict(self, docs: Iterable[Doc]) -> List[Ints2d]:
+        if self.top_k == 1:
+            scores2guesses = self._scores2guesses_top_k_equals_1
+        elif self.top_k <= TOP_K_GUARDRAIL:
+            scores2guesses = self._scores2guesses_top_k_greater_1
+        else:
+            scores2guesses = self._scores2guesses_top_k_guardrail
+        # The behaviour of *_scores2guesses_top_k_greater_1()* is efficient for values
+        # of *top_k>1* that are likely to be useful when the edit tree lemmatizer is used
+        # for its principal purpose of lemmatizing tokens. However, the code could also
+        # be used for other purposes, and with very large values of *top_k* the method
+        # becomes inefficient. In such cases, *_scores2guesses_top_k_guardrail()* is used
+        # instead.
         n_docs = len(list(docs))
         if not any(len(doc) for doc in docs):
             # Handle cases where there are no tokens in any docs.

@@ -153,20 +170,52 @@ class EditTreeLemmatizer(TrainablePipe):
             return guesses
         scores = self.model.predict(docs)
         assert len(scores) == n_docs
-        guesses = self._scores2guesses(docs, scores)
+        guesses = scores2guesses(docs, scores)
         assert len(guesses) == n_docs
         return guesses

-    def _scores2guesses(self, docs, scores):
+    def _scores2guesses_top_k_equals_1(self, docs, scores):
         guesses = []
         for doc, doc_scores in zip(docs, scores):
-            if self.top_k == 1:
-                doc_guesses = doc_scores.argmax(axis=1).reshape(-1, 1)
-            else:
-                doc_guesses = np.argsort(doc_scores)[..., : -self.top_k - 1 : -1]
-
-            if not isinstance(doc_guesses, np.ndarray):
-                doc_guesses = doc_guesses.get()
+            doc_guesses = doc_scores.argmax(axis=1)
+            doc_guesses = self.numpy_ops.asarray(doc_guesses)
+
+            doc_compat_guesses = []
+            for i, token in enumerate(doc):
+                tree_id = self.cfg["labels"][doc_guesses[i]]
+                if self.trees.apply(tree_id, token.text) is not None:
+                    doc_compat_guesses.append(tree_id)
+                else:
+                    doc_compat_guesses.append(-1)
+            guesses.append(np.array(doc_compat_guesses))
+
+        return guesses
+
+    def _scores2guesses_top_k_greater_1(self, docs, scores):
+        guesses = []
+        top_k = min(self.top_k, len(self.labels))
+        for doc, doc_scores in zip(docs, scores):
+            doc_scores = self.numpy_ops.asarray(doc_scores)
+            doc_compat_guesses = []
+            for i, token in enumerate(doc):
+                for _ in range(top_k):
+                    candidate = int(doc_scores[i].argmax())
+                    candidate_tree_id = self.cfg["labels"][candidate]
+                    if self.trees.apply(candidate_tree_id, token.text) is not None:
+                        doc_compat_guesses.append(candidate_tree_id)
+                        break
+                    doc_scores[i, candidate] = np.finfo(np.float32).min
+                else:
+                    doc_compat_guesses.append(-1)
+            guesses.append(np.array(doc_compat_guesses))
+
+        return guesses
+
+    def _scores2guesses_top_k_guardrail(self, docs, scores):
+        guesses = []
+        for doc, doc_scores in zip(docs, scores):
+            doc_guesses = np.argsort(doc_scores)[..., : -self.top_k - 1 : -1]
+            doc_guesses = self.numpy_ops.asarray(doc_guesses)

             doc_compat_guesses = []
             for token, candidates in zip(doc, doc_guesses):
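Note: `predict` above now dispatches on `top_k`: `1` uses a plain argmax, values up to `TOP_K_GUARDRAIL` (20) use an iterative argmax, and larger values fall back to a full argsort. A sketch of how a user lands on each path via the pipe config, mirroring the parametrized tests later in this diff:

```python
import spacy

nlp = spacy.blank("en")
# top_k == 1  -> _scores2guesses_top_k_equals_1 (plain argmax)
# 2..20       -> _scores2guesses_top_k_greater_1 (iterative argmax)
# > 20        -> _scores2guesses_top_k_guardrail (full argsort)
lemmatizer = nlp.add_pipe("trainable_lemmatizer", config={"top_k": 5})
```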
@@ -163,15 +163,33 @@ class TokenPatternString(BaseModel):
     IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
     INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects")
     FUZZY: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy")
-    FUZZY1: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy1")
-    FUZZY2: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy2")
-    FUZZY3: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy3")
-    FUZZY4: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy4")
-    FUZZY5: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy5")
-    FUZZY6: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy6")
-    FUZZY7: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy7")
-    FUZZY8: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy8")
-    FUZZY9: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy9")
+    FUZZY1: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy1"
+    )
+    FUZZY2: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy2"
+    )
+    FUZZY3: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy3"
+    )
+    FUZZY4: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy4"
+    )
+    FUZZY5: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy5"
+    )
+    FUZZY6: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy6"
+    )
+    FUZZY7: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy7"
+    )
+    FUZZY8: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy8"
+    )
+    FUZZY9: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy9"
+    )

     class Config:
         extra = "forbid"
@@ -101,14 +101,15 @@ def test_initialize_from_labels():
     }


-def test_no_data():
+@pytest.mark.parametrize("top_k", (1, 5, 30))
+def test_no_data(top_k):
     # Test that the lemmatizer provides a nice error when there's no tagging data / labels
     TEXTCAT_DATA = [
         ("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
         ("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
     ]
     nlp = English()
-    nlp.add_pipe("trainable_lemmatizer")
+    nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
     nlp.add_pipe("textcat")

     train_examples = []

@@ -119,10 +120,11 @@ def test_no_data():
         nlp.initialize(get_examples=lambda: train_examples)


-def test_incomplete_data():
+@pytest.mark.parametrize("top_k", (1, 5, 30))
+def test_incomplete_data(top_k):
     # Test that the lemmatizer works with incomplete information
     nlp = English()
-    lemmatizer = nlp.add_pipe("trainable_lemmatizer")
+    lemmatizer = nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
     lemmatizer.min_tree_freq = 1
     train_examples = []
     for t in PARTIAL_DATA:

@@ -154,9 +156,10 @@ def test_incomplete_data():
     assert xp.count_nonzero(dX[1][1]) == 0


-def test_overfitting_IO():
+@pytest.mark.parametrize("top_k", (1, 5, 30))
+def test_overfitting_IO(top_k):
     nlp = English()
-    lemmatizer = nlp.add_pipe("trainable_lemmatizer")
+    lemmatizer = nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
     lemmatizer.min_tree_freq = 1
     train_examples = []
     for t in TRAIN_DATA:

@@ -189,7 +192,7 @@ def test_overfitting_IO():
     # Check model after a {to,from}_bytes roundtrip
     nlp_bytes = nlp.to_bytes()
     nlp3 = English()
-    nlp3.add_pipe("trainable_lemmatizer")
+    nlp3.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
     nlp3.from_bytes(nlp_bytes)
     doc3 = nlp3(test_text)
     assert doc3[0].lemma_ == "she"
@@ -618,7 +618,6 @@ def test_string_to_list_intify(value):
     assert string_to_list(value, intify=True) == [1, 2, 3]


-@pytest.mark.skip(reason="Temporarily skip for dev version")
 def test_download_compatibility():
     spec = SpecifierSet("==" + about.__version__)
     spec.prereleases = False

@@ -629,7 +628,6 @@ def test_download_compatibility():
     assert get_minor_version(about.__version__) == get_minor_version(version)


-@pytest.mark.skip(reason="Temporarily skip for dev version")
 def test_validate_compatibility_table():
     spec = SpecifierSet("==" + about.__version__)
     spec.prereleases = False
website/.dockerignore (new file, 9 lines)

@@ -0,0 +1,9 @@
+.cache/
+.next/
+public/
+node_modules
+.npm
+logs
+*.log
+npm-debug.log*
+quickstart-training-generator.js
website/.gitignore (vendored, 4 changed lines)

@@ -1,5 +1,7 @@
 # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.

+quickstart-training-generator.js
+
 # dependencies
 /node_modules
 /.pnp

@@ -41,4 +43,4 @@ next-env.d.ts
 public/robots.txt
 public/sitemap*
 public/sw.js*
-public/workbox*
\ No newline at end of file
+public/workbox*
@@ -1,16 +1,14 @@
-FROM node:11.15.0
+FROM node:18

-WORKDIR /spacy-io
-
-RUN npm install -g gatsby-cli@2.7.4
-
-COPY package.json .
-COPY package-lock.json .
-
-RUN npm install
+USER node

 # This is so the installed node_modules will be up one directory
 # from where a user mounts files, so that they don't accidentally mount
 # their own node_modules from a different build
 # https://nodejs.org/api/modules.html#modules_loading_from_node_modules_folders
-WORKDIR /spacy-io/website/
+WORKDIR /home/node
+COPY --chown=node package.json .
+COPY --chown=node package-lock.json .
+RUN npm install
+
+WORKDIR /home/node/website/
@@ -41,33 +41,27 @@ If you'd like to do this, **be sure you do _not_ include your local
 `node_modules` folder**, since there are some dependencies that need to be built
 for the image system. Rename it before using.

-```bash
-docker run -it \
-  -v $(pwd):/spacy-io/website \
-  -p 8000:8000 \
-  ghcr.io/explosion/spacy-io \
-  gatsby develop -H 0.0.0.0
-```
-
-This will allow you to access the built website at http://0.0.0.0:8000/ in your
-browser, and still edit code in your editor while having the site reflect those
-changes.
-
-**Note**: If you're working on a Mac with an M1 processor, you might see
-segfault errors from `qemu` if you use the default image. To fix this use the
-`arm64` tagged image in the `docker run` command
-(ghcr.io/explosion/spacy-io:arm64).
-
-### Building the Docker image
-
-If you'd like to build the image locally, you can do so like this:
+First build the Docker image. This only needs to be done on the first run
+or when changes are made to `Dockerfile` or the website dependencies:

 ```bash
 docker build -t spacy-io .
 ```

-This will take some time, so if you want to use the prebuilt image you'll save a
-bit of time.
+You can then build and run the website with:
+
+```bash
+docker run -it \
+  --rm \
+  -v $(pwd):/home/node/website \
+  -p 3000:3000 \
+  spacy-io \
+  npm run dev -- -H 0.0.0.0
+```
+
+This will allow you to access the built website at http://0.0.0.0:3000/ in your
+browser, and still edit code in your editor while having the site reflect those
+changes.

 ## Project structure
@@ -1215,19 +1215,19 @@ When a directory is provided it is traversed recursively to collect all files.
 $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process]
 ```

 | Name | Description |
-| ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `model` | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~ |
 | `data_path` | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~ |
 | `output-file`, `-o` | Output `DocBin` path. ~~str (positional)~~ |
-| `--code`, `-c` <Tag variant="new">3</Tag> | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
 | `--text-key`, `-tk` | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~ |
 | `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~ |
 | `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
 | `--batch-size`, `-b` | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~ |
 | `--n-process`, `-n` | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~ |
 | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
 | **CREATES** | A `DocBin` with the annotations from the `model` for all the files found in `data-path`. |

 ## find-threshold {id="find-threshold",version="3.5",tag="command"}
@@ -15,7 +15,7 @@ world". It requires a `KnowledgeBase`, as well as a function to generate
 plausible candidates from that `KnowledgeBase` given a certain textual mention,
 and a machine learning model to pick the right candidate, given the local
 context of the mention. `EntityLinker` defaults to using the
-[`InMemoryLookupKB`](/api/kb_in_memory) implementation.
+[`InMemoryLookupKB`](/api/inmemorylookupkb) implementation.

 ## Assigned Attributes {id="assigned-attributes"}
@@ -43,7 +43,7 @@ The length of the fixed-size entity vectors in the knowledge base.

 Add an entity to the knowledge base, specifying its corpus frequency and entity
 vector, which should be of length
-[`entity_vector_length`](/api/kb_in_memory#entity_vector_length).
+[`entity_vector_length`](/api/inmemorylookupkb#entity_vector_length).

 > #### Example
 >

@@ -79,8 +79,9 @@ frequency and entity vector for each entity.

 Add an alias or mention to the knowledge base, specifying its potential KB
 identifiers and their prior probabilities. The entity identifiers should refer
-to entities previously added with [`add_entity`](/api/kb_in_memory#add_entity)
-or [`set_entities`](/api/kb_in_memory#set_entities). The sum of the prior
+to entities previously added with
+[`add_entity`](/api/inmemorylookupkb#add_entity) or
+[`set_entities`](/api/inmemorylookupkb#set_entities). The sum of the prior
 probabilities should not exceed 1. Note that an empty string can not be used as
 alias.

@@ -156,7 +157,7 @@ Get a list of all aliases in the knowledge base.

 Given a certain textual mention as input, retrieve a list of candidate entities
 of type [`Candidate`](/api/kb#candidate). Wraps
-[`get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
+[`get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).

 > #### Example
 >

@@ -174,7 +175,7 @@ of type [`Candidate`](/api/kb#candidate). Wraps

 ## InMemoryLookupKB.get_candidates_batch {id="get_candidates_batch",tag="method"}

-Same as [`get_candidates()`](/api/kb_in_memory#get_candidates), but for an
+Same as [`get_candidates()`](/api/inmemorylookupkb#get_candidates), but for an
 arbitrary number of mentions. The [`EntityLinker`](/api/entitylinker) component
 will call `get_candidates_batch()` instead of `get_candidates()`, if the config
 parameter `candidates_batch_size` is greater or equal than 1.

@@ -231,7 +232,7 @@ Given a certain entity ID, retrieve its pretrained entity vector.

 ## InMemoryLookupKB.get_vectors {id="get_vectors",tag="method"}

-Same as [`get_vector()`](/api/kb_in_memory#get_vector), but for an arbitrary
+Same as [`get_vector()`](/api/inmemorylookupkb#get_vector), but for an arbitrary
 number of entity IDs.

 The default implementation of `get_vectors()` executes `get_vector()` in a loop.
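Note: the renamed pages above document `add_entity`, `add_alias` and `get_alias_candidates` separately; a combined minimal sketch of building an `InMemoryLookupKB` (the entity IDs, frequencies and vectors here are made up):

```python
import spacy
from spacy.kb import InMemoryLookupKB

nlp = spacy.blank("en")
kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=3)

# Entity vectors must have length entity_vector_length
kb.add_entity(entity="Q1004791", freq=6, entity_vector=[0.0, 3.0, 5.0])
kb.add_entity(entity="Q42", freq=342, entity_vector=[1.0, 9.0, -3.0])

# Prior probabilities for one alias must sum to at most 1
kb.add_alias(alias="Douglas", entities=["Q1004791", "Q42"], probabilities=[0.6, 0.3])

print([c.entity_ for c in kb.get_alias_candidates("Douglas")])
```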
@@ -21,8 +21,8 @@ functions called by the [`EntityLinker`](/api/entitylinker) component.
 <Infobox variant="warning">

 This class was not abstract up to spaCy version 3.5. The `KnowledgeBase`
-implementation up to that point is available as `InMemoryLookupKB` from 3.5
-onwards.
+implementation up to that point is available as
+[`InMemoryLookupKB`](/api/inmemorylookupkb) from 3.5 onwards.

 </Infobox>

@@ -110,14 +110,15 @@ to you.
 </Infobox>

 From spaCy 3.5 on `KnowledgeBase` is an abstract class (with
-[`InMemoryLookupKB`](/api/kb_in_memory) being a drop-in replacement) to allow
-more flexibility in customizing knowledge bases. Some of its methods were moved
-to [`InMemoryLookupKB`](/api/kb_in_memory) during this refactoring, one of those
-being `get_alias_candidates()`. This method is now available as
-[`InMemoryLookupKB.get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
-Note: [`InMemoryLookupKB.get_candidates()`](/api/kb_in_memory#get_candidates)
+[`InMemoryLookupKB`](/api/inmemorylookupkb) being a drop-in replacement) to
+allow more flexibility in customizing knowledge bases. Some of its methods were
+moved to [`InMemoryLookupKB`](/api/inmemorylookupkb) during this refactoring,
+one of those being `get_alias_candidates()`. This method is now available as
+[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
+Note:
+[`InMemoryLookupKB.get_candidates()`](/api/inmemorylookupkb#get_candidates)
 defaults to
-[`InMemoryLookupKB.get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
+[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).

 ## KnowledgeBase.get_vector {id="get_vector",tag="method"}
@@ -79,7 +79,7 @@ operates on a `Doc` and gives you access to the matched tokens **in context**.
 | ------------------------------------------------ | -------------------------------------------------------------------------------------------------- |
 | [`Corpus`](/api/corpus) | Class for managing annotated corpora for training and evaluation data. |
 | [`KnowledgeBase`](/api/kb) | Abstract base class for storage and retrieval of data for entity linking. |
-| [`InMemoryLookupKB`](/api/kb_in_memory) | Implementation of `KnowledgeBase` storing all data in memory. |
+| [`InMemoryLookupKB`](/api/inmemorylookupkb) | Implementation of `KnowledgeBase` storing all data in memory. |
 | [`Candidate`](/api/kb#candidate) | Object associating a textual mention with a specific entity contained in a `KnowledgeBase`. |
 | [`Lookups`](/api/lookups) | Container for convenient access to large lookup tables and dictionaries. |
 | [`MorphAnalysis`](/api/morphology#morphanalysis) | A morphological analysis. |
@@ -384,14 +384,14 @@ the more specific attributes `FUZZY1`..`FUZZY9` you can specify the maximum
 allowed edit distance directly.

 ```python
-# Match lowercase with fuzzy matching (allows 2 edits)
+# Match lowercase with fuzzy matching (allows 3 edits)
 pattern = [{"LOWER": {"FUZZY": "definitely"}}]

-# Match custom attribute values with fuzzy matching (allows 2 edits)
+# Match custom attribute values with fuzzy matching (allows 3 edits)
 pattern = [{"_": {"country": {"FUZZY": "Kyrgyzstan"}}}]

-# Match with exact Levenshtein edit distance limits (allows 3 edits)
-pattern = [{"_": {"country": {"FUZZY3": "Kyrgyzstan"}}}]
+# Match with exact Levenshtein edit distance limits (allows 4 edits)
+pattern = [{"_": {"country": {"FUZZY4": "Kyrgyzstan"}}}]
 ```

 #### Regex and fuzzy matching with lists {id="regex-fuzzy-lists", version="3.5"}
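Note: the comment fixes above restate the default budget of `max(2, round(0.3 * len(pattern)))` edits. An end-to-end sketch of the first pattern in use:

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# "definitely" has 10 characters, so the default budget is max(2, 3) = 3 edits
matcher.add("DEFINITELY", [[{"LOWER": {"FUZZY": "definitely"}}]])

doc = nlp("I will definately be there")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)  # "definately" (one substitution) matches
```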
@@ -684,10 +684,15 @@ If your pipeline includes
 [custom components](/usage/processing-pipelines#custom-components), model
 architectures or other [code](/usage/training#custom-code), those functions need
 to be registered **before** your pipeline is loaded. Otherwise, spaCy won't know
-how to create the objects referenced in the config. The
-[`spacy package`](/api/cli#package) command lets you provide one or more paths
-to Python files containing custom registered functions using the `--code`
-argument.
+how to create the objects referenced in the config. If you're loading your own
+pipeline in Python, you can make custom components available just by importing
+the code that defines them before calling
+[`spacy.load`](/api/top-level#spacy.load). This is also how the `--code`
+argument to CLI commands works.
+
+With the [`spacy package`](/api/cli#package) command, you can provide one or
+more paths to Python files containing custom registered functions using the
+`--code` argument.

 > #### \_\_init\_\_.py (excerpt)
 >
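Note: per the rewritten paragraph above, importing the module that defines your registered functions before `spacy.load` is enough. A minimal sketch (the module, component and pipeline names are hypothetical):

```python
# my_components.py -- defines and registers a custom component
from spacy.language import Language

@Language.component("debug_lengths")
def debug_lengths(doc):
    print(len(doc))
    return doc
```

```python
# load_pipeline.py -- the import runs the registration before loading
import spacy
import my_components  # noqa: F401

nlp = spacy.load("./my_custom_pipeline")  # config may reference "debug_lengths"
```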
website/docs/usage/v3-5.mdx (new file, 215 lines)

@@ -0,0 +1,215 @@
+---
+title: What's New in v3.5
+teaser: New features and how to upgrade
+menu:
+  - ['New Features', 'features']
+  - ['Upgrading Notes', 'upgrading']
+---
+
+## New features {id="features",hidden="true"}
+
+spaCy v3.5 introduces three new CLI commands, `apply`, `benchmark` and
+`find-threshold`, adds fuzzy matching, provides improvements to our entity
+linking functionality, and includes a range of language updates and bug fixes.
+
+### New CLI commands {id="cli"}
+
+#### apply CLI
+
+The [`apply` CLI](/api/cli#apply) can be used to apply a pipeline to one or more
+`.txt`, `.jsonl` or `.spacy` input files, saving the annotated docs in a single
+`.spacy` file.
+
+```bash
+$ spacy apply en_core_web_sm my_texts/ output.spacy
+```
+
+#### benchmark CLI
+
+The [`benchmark` CLI](/api/cli#benchmark) has been added to extend the existing
+`evaluate` functionality with a wider range of profiling subcommands.
+
+The `benchmark accuracy` CLI is introduced as an alias for `evaluate`. The new
+`benchmark speed` CLI performs warmup rounds before measuring the speed in words
+per second on batches of randomly shuffled documents from the provided data.
+
+```bash
+$ spacy benchmark speed my_pipeline data.spacy
+```
+
+The output is the mean performance using batches (`nlp.pipe`) with a 95%
+confidence interval, e.g., profiling `en_core_web_sm` on CPU:
+
+```none
+Outliers: 2.0%, extreme outliers: 0.0%
+Mean: 18904.1 words/s (95% CI: -256.9 +244.1)
+```
+
+#### find-threshold CLI
+
+The [`find-threshold` CLI](/api/cli#find-threshold) runs a series of trials
+across threshold values from `0.0` to `1.0` and identifies the best threshold
+for the provided score metric.
+
+The following command runs 20 trials for the `spancat` component in
+`my_pipeline`, recording the `spans_sc_f` score for each value of the threshold
+`[components.spancat.threshold]` from `0.0` to `1.0`:
+
+```bash
+$ spacy find-threshold my_pipeline data.spacy spancat threshold spans_sc_f --n_trials 20
+```
+
+The `find-threshold` CLI can be used with `textcat_multilabel`, `spancat` and
+custom components with thresholds that are applied while predicting or scoring.
+
+### Fuzzy matching {id="fuzzy"}
+
+New `FUZZY` operators support [fuzzy matching](/usage/rule-based-matching#fuzzy)
+with the `Matcher`. By default, the `FUZZY` operator allows a Levenshtein edit
+distance of 2 and up to 30% of the pattern string length. `FUZZY1`..`FUZZY9` can
+be used to specify the exact number of allowed edits.
+
+```python
+# Match lowercase with fuzzy matching (allows up to 3 edits)
+pattern = [{"LOWER": {"FUZZY": "definitely"}}]
+
+# Match custom attribute values with fuzzy matching (allows up to 3 edits)
+pattern = [{"_": {"country": {"FUZZY": "Kyrgyzstan"}}}]
+
+# Match with exact Levenshtein edit distance limits (allows up to 4 edits)
+pattern = [{"_": {"country": {"FUZZY4": "Kyrgyzstan"}}}]
+```
+
+Note that `FUZZY` uses Levenshtein edit distance rather than Damerau-Levenshtein
+edit distance, so a transposition like `teh` for `the` counts as two edits, one
+insertion and one deletion.
+
+If you'd prefer an alternate fuzzy matching algorithm, you can provide your own
+custom method to the `Matcher` or as a config option for an entity ruler and
+span ruler.
+
+### FUZZY and REGEX with lists {id="fuzzy-regex-lists"}
+
+The `FUZZY` and `REGEX` operators are also now supported for lists with `IN` and
+`NOT_IN`:
+
+```python
+pattern = [{"TEXT": {"FUZZY": {"IN": ["awesome", "cool", "wonderful"]}}}]
+pattern = [{"TEXT": {"REGEX": {"NOT_IN": ["^awe(some)?$", "^wonder(ful)?"]}}}]
+```
+
+### Entity linking generalization {id="el"}
+
+The knowledge base used for entity linking is now easier to customize and has a
+new default implementation [`InMemoryLookupKB`](/api/inmemorylookupkb).
+
+### Additional features and improvements {id="additional-features-and-improvements"}
+
+- Language updates:
+  - Extended support for Slovenian
+  - Fixed lookup fallback for French and Catalan lemmatizers
+  - Switch Russian and Ukrainian lemmatizers to `pymorphy3`
+  - Support for editorial punctuation in Ancient Greek
+  - Update to Russian tokenizer exceptions
+  - Small fix for Dutch stop words
+- Allow up to `typer` v0.7.x, `mypy` 0.990 and `typing_extensions` v4.4.x.
+- New `spacy.ConsoleLogger.v3` with expanded progress
+  [tracking](/api/top-level#ConsoleLogger).
+- Improved scoring behavior for `textcat` with `spacy.textcat_scorer.v2` and
+  `spacy.textcat_multilabel_scorer.v2`.
+- Updates so that downstream components can train properly on a frozen `tok2vec`
+  or `transformer` layer.
+- Allow interpolation of variables in directory names in projects.
+- Support for local file system [remotes](/usage/projects#remote) for projects.
+- Improve UX around `displacy.serve` when the default port is in use.
+- Optional `before_update` callback that is invoked at the start of each
+  [training step](/api/data-formats#config-training).
+- Improve performance of `SpanGroup` and fix typing issues for `SpanGroup` and
+  `Span` objects.
+- Patch a
+  [security vulnerability](https://github.com/advisories/GHSA-gw9q-c7gh-j9vm) in
+  extracting tar files.
+- Add equality definition for `Vectors`.
+- Ensure `Vocab.to_disk` respects the exclude setting for `lookups` and
+  `vectors`.
+- Correctly handle missing annotations in the edit tree lemmatizer.
+
+### Trained pipeline updates {id="pipelines"}
+
+- The CNN pipelines add `IS_SPACE` as a `tok2vec` feature for `tagger` and
+  `morphologizer` components to improve tagging of non-whitespace vs. whitespace
+  tokens.
+- The transformer pipelines require `spacy-transformers` v1.2, which uses the
+  exact alignment from `tokenizers` for fast tokenizers instead of the heuristic
+  alignment from `spacy-alignments`. For all trained pipelines except
+  `ja_core_news_trf`, the alignments between spaCy tokens and transformer tokens
+  may be slightly different. More details about the `spacy-transformers` changes
+  in the
+  [v1.2.0 release notes](https://github.com/explosion/spacy-transformers/releases/tag/v1.2.0).
+
+## Notes about upgrading from v3.4 {id="upgrading"}
+
+### Validation of textcat values {id="textcat-validation"}
+
+An error is now raised when unsupported values are given as input to train a
+`textcat` or `textcat_multilabel` model - ensure that values are `0.0` or `1.0`
+as explained in the [docs](/api/textcategorizer#assigned-attributes).
+
+### Updated scorers for tokenization and textcat {id="scores"}
+
+We fixed a bug that inflated the `token_acc` scores in v3.0-v3.4. The reported
+`token_acc` will drop from v3.4 to v3.5, but if `token_p/r/f` stay the same,
+your tokenization performance has not changed from v3.4.
+
+For new `textcat` or `textcat_multilabel` configs, the new default `v2` scorers:
+
+- ignore `threshold` for `textcat`, so the reported `cats_p/r/f` may increase
+  slightly in v3.5 even though the underlying predictions are unchanged
+- report the performance of only the **final** `textcat` or `textcat_multilabel`
+  component in the pipeline by default
+- allow custom scorers to be used to score multiple `textcat` and
+  `textcat_multilabel` components with `Scorer.score_cats` by restricting the
+  evaluation to the component's provided labels
+
+### Pipeline package version compatibility {id="version-compat"}
+
+> #### Using legacy implementations
+>
+> In spaCy v3, you'll still be able to load and reference legacy implementations
+> via [`spacy-legacy`](https://github.com/explosion/spacy-legacy), even if the
+> components or architectures change and newer versions are available in the
+> core library.
+
+When you're loading a pipeline package trained with an earlier version of spaCy
+v3, you will see a warning telling you that the pipeline may be incompatible.
+This doesn't necessarily have to be true, but we recommend running your
+pipelines against your test suite or evaluation data to make sure there are no
+unexpected results.
+
+If you're using one of the [trained pipelines](/models) we provide, you should
+run [`spacy download`](/api/cli#download) to update to the latest version. To
+see an overview of all installed packages and their compatibility, you can run
+[`spacy validate`](/api/cli#validate).
+
+If you've trained your own custom pipeline and you've confirmed that it's still
+working as expected, you can update the spaCy version requirements in the
+[`meta.json`](/api/data-formats#meta):
+
+```diff
+- "spacy_version": ">=3.4.0,<3.5.0",
++ "spacy_version": ">=3.4.0,<3.6.0",
+```
+
+### Updating v3.4 configs
+
+To update a config from spaCy v3.4 with the new v3.5 settings, run
+[`init fill-config`](/api/cli#init-fill-config):
+
+```cli
+$ python -m spacy init fill-config config-v3.4.cfg config-v3.5.cfg
+```
+
+In many cases ([`spacy train`](/api/cli#train),
+[`spacy.load`](/api/top-level#spacy.load)), the new defaults will be filled in
+automatically, but you'll need to fill in the new settings to run
+[`debug config`](/api/cli#debug) and [`debug data`](/api/cli#debug-data).
@@ -13,7 +13,8 @@
                 { "text": "New in v3.1", "url": "/usage/v3-1" },
                 { "text": "New in v3.2", "url": "/usage/v3-2" },
                 { "text": "New in v3.3", "url": "/usage/v3-3" },
-                { "text": "New in v3.4", "url": "/usage/v3-4" }
+                { "text": "New in v3.4", "url": "/usage/v3-4" },
+                { "text": "New in v3.5", "url": "/usage/v3-5" }
             ]
         },
         {

@@ -129,6 +130,7 @@
             "items": [
                 { "text": "Attributes", "url": "/api/attributes" },
                 { "text": "Corpus", "url": "/api/corpus" },
+                { "text": "InMemoryLookupKB", "url": "/api/inmemorylookupkb" },
                 { "text": "KnowledgeBase", "url": "/api/kb" },
                 { "text": "Lookups", "url": "/api/lookups" },
                 { "text": "MorphAnalysis", "url": "/api/morphology#morphanalysis" },
@@ -89,7 +89,7 @@ const Landing = () => {
                 </LandingCard>

                 <LandingCard title="Awesome ecosystem" url="/usage/projects" button="Read more">
-                    In the five years since its release, spaCy has become an industry standard with
+                    Since its release in 2015, spaCy has become an industry standard with
                     a huge ecosystem. Choose from a variety of plugins, integrate with your machine
                     learning stack and build custom components and workflows.
                 </LandingCard>
@@ -9,6 +9,8 @@ import socialImageLegacy from '../images/social_legacy.jpg'
 import siteMetadata from '../../meta/site.json'
 import Head from 'next/head'

+import { siteUrl } from '../../meta/dynamicMeta.mjs'
+
 function getPageTitle(title, sitename, slogan, sectionTitle, nightly, legacy) {
     if (sectionTitle && title) {
         const suffix = nightly ? ' (nightly)' : legacy ? ' (legacy)' : ''

@@ -25,7 +27,7 @@ function getImage(section, nightly, legacy) {
     if (legacy) return socialImageLegacy
     if (section === 'api') return socialImageApi
     if (section === 'universe') return socialImageUniverse
-    return socialImageDefault
+    return `${siteUrl}${socialImageDefault.src}`
 }

 export default function SEO({

@@ -46,7 +48,7 @@ export default function SEO({
         nightly,
         legacy
     )
-    const socialImage = getImage(section, nightly, legacy).src
+    const socialImage = getImage(section, nightly, legacy)
     const meta = [
         {
             name: 'description',
@@ -20,6 +20,10 @@
     display: inline-block
     margin-bottom: var(--spacing-sm)

+    .ol, .ul
+        margin-top: var(--spacing-xs)
+        margin-bottom: var(--spacing-xs)
+
     &:before
         content: '\25CF'
         position: relative
@@ -58,8 +58,8 @@ const AlertSpace = ({ nightly, legacy }) => {
 }

 const navAlert = (
-    <Link to="/usage/v3-4" hidden>
-        <strong>💥 Out now:</strong> spaCy v3.4
+    <Link to="/usage/v3-5" hidden>
+        <strong>💥 Out now:</strong> spaCy v3.5
     </Link>
 )