Mirror of https://github.com/explosion/spaCy.git (synced 2025-07-30 10:00:04 +03:00)

Commit 5a08596f92: Merge branch 'master' into feature/etl

.github/azure-steps.yml (vendored, 44 lines changed)

@@ -52,17 +52,17 @@ steps:
     python -W error -c "import spacy"
   displayName: "Test import"
 
-  # - script: |
-  #     python -m spacy download ca_core_news_sm
-  #     python -m spacy download ca_core_news_md
-  #     python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
-  #   displayName: 'Test download CLI'
-  #   condition: eq(variables['python_version'], '3.8')
-  #
-  # - script: |
-  #     python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
-  #   displayName: 'Test no warnings on load (#11713)'
-  #   condition: eq(variables['python_version'], '3.8')
+  - script: |
+      python -m spacy download ca_core_news_sm
+      python -m spacy download ca_core_news_md
+      python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+    displayName: 'Test download CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+    displayName: 'Test no warnings on load (#11713)'
+    condition: eq(variables['python_version'], '3.8')
 
   - script: |
       python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .

@@ -86,17 +86,17 @@ steps:
     displayName: 'Test train CLI'
     condition: eq(variables['python_version'], '3.8')
 
-  # - script: |
-  #     python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-  #     PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
-  #   displayName: 'Test assemble CLI'
-  #   condition: eq(variables['python_version'], '3.8')
-  #
-  # - script: |
-  #     python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
-  #     python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
-  #   displayName: 'Test assemble CLI vectors warning'
-  #   condition: eq(variables['python_version'], '3.8')
+  - script: |
+      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+    displayName: 'Test assemble CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+      python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+    displayName: 'Test assemble CLI vectors warning'
+    condition: eq(variables['python_version'], '3.8')
 
   - script: |
       python -m pip install -U -r requirements.txt

.gitignore (vendored, 10 lines changed)

@@ -10,16 +10,6 @@ spacy/tests/package/setup.cfg
 spacy/tests/package/pyproject.toml
 spacy/tests/package/requirements.txt
 
-# Website
-website/.cache/
-website/public/
-website/node_modules
-website/.npm
-website/logs
-*.log
-npm-debug.log*
-quickstart-training-generator.js
-
 # Cython / C extensions
 cythonize.json
 spacy/*.html

@@ -16,7 +16,7 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
 model packaging, deployment and workflow management. spaCy is commercial
 open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
 
-💫 **Version 3.4 out now!**
+💫 **Version 3.5 out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)
 
 [](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)

@@ -22,7 +22,7 @@ langcodes>=3.2.0,<4.0.0
 # Official Python utilities
 setuptools
 packaging>=20.0
-typing_extensions>=3.7.4.1,<4.2.0; python_version < "3.8"
+typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
 # Development dependencies
 pre-commit>=2.13.0
 cython>=0.25,<3.0

@@ -63,7 +63,7 @@ install_requires =
     # Official Python utilities
     setuptools
     packaging>=20.0
-    typing_extensions>=3.7.4,<4.2.0; python_version < "3.8"
+    typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
     langcodes>=3.2.0,<4.0.0
 
 [options.entry_points]

@@ -106,9 +106,7 @@ def serve(
 
     if is_in_jupyter():
         warnings.warn(Warnings.W011)
-    render(
-        docs, style=style, page=page, minify=minify, options=options, manual=manual
-    )
+    render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
     httpd = simple_server.make_server(host, port, app)
     print(f"\nUsing the '{style}' visualizer")
     print(f"Serving on http://{host}:{port} ...\n")

@@ -25,7 +25,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
     """An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases,
     to support entity linking of named entities to real-world concepts.
 
-    DOCS: https://spacy.io/api/kb_in_memory
+    DOCS: https://spacy.io/api/inmemorylookupkb
     """
 
     def __init__(self, Vocab vocab, entity_vector_length):

@@ -22,7 +22,7 @@ cpdef bint levenshtein_compare(input_text: str, pattern_text: str, fuzzy: int =
         max_edits = fuzzy
     else:
         # allow at least two edits (to allow at least one transposition) and up
-        # to 20% of the pattern string length
+        # to 30% of the pattern string length
        max_edits = max(2, round(0.3 * len(pattern_text)))
    return levenshtein(input_text, pattern_text, max_edits) <= max_edits
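
For illustration, the comment fix above only brings the prose in line with the formula already in the code. A standalone sketch (not spaCy's Cython implementation) of how that default edit allowance scales with pattern length:

```python
# Illustrative sketch of the max_edits formula shown in the hunk above.
# The `fuzzy >= 0` guard is assumed from context; the hunk only shows the else branch.

def default_max_edits(pattern_text: str, fuzzy: int = -1) -> int:
    """Edit allowance for fuzzy matching: an explicit `fuzzy` value wins,
    otherwise allow at least 2 edits and up to 30% of the pattern length."""
    if fuzzy >= 0:
        return fuzzy
    return max(2, round(0.3 * len(pattern_text)))

for word in ["the", "definitely", "Kyrgyzstan"]:
    print(word, default_max_edits(word))
# the 2
# definitely 3
# Kyrgyzstan 3
```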

@@ -5,8 +5,12 @@ from ..vocab import Vocab
 from ..tokens import Doc, Span
 
 class Matcher:
-    def __init__(self, vocab: Vocab, validate: bool = ...,
-                 fuzzy_compare: Callable[[str, str, int], bool] = ...) -> None: ...
+    def __init__(
+        self,
+        vocab: Vocab,
+        validate: bool = ...,
+        fuzzy_compare: Callable[[str, str, int], bool] = ...,
+    ) -> None: ...
     def __reduce__(self) -> Any: ...
     def __len__(self) -> int: ...
     def __contains__(self, key: str) -> bool: ...

@@ -5,8 +5,8 @@ from itertools import islice
 import numpy as np
 
 import srsly
-from thinc.api import Config, Model, SequenceCategoricalCrossentropy
-from thinc.types import Floats2d, Ints1d, Ints2d
+from thinc.api import Config, Model, SequenceCategoricalCrossentropy, NumpyOps
+from thinc.types import Floats2d, Ints2d
 
 from ._edit_tree_internals.edit_trees import EditTrees
 from ._edit_tree_internals.schemas import validate_edit_tree

@@ -20,6 +20,10 @@ from ..vocab import Vocab
 from .. import util
 
 
+# The cutoff value of *top_k* above which an alternative method is used to process guesses.
+TOP_K_GUARDRAIL = 20
+
+
 default_model_config = """
 [model]
 @architectures = "spacy.Tagger.v2"

@@ -115,6 +119,7 @@ class EditTreeLemmatizer(TrainablePipe):
 
         self.cfg: Dict[str, Any] = {"labels": []}
         self.scorer = scorer
+        self.numpy_ops = NumpyOps()
 
     def get_loss(
         self, examples: Iterable[Example], scores: List[Floats2d]

@@ -144,6 +149,18 @@ class EditTreeLemmatizer(TrainablePipe):
         return float(loss), d_scores
 
     def predict(self, docs: Iterable[Doc]) -> List[Ints2d]:
+        if self.top_k == 1:
+            scores2guesses = self._scores2guesses_top_k_equals_1
+        elif self.top_k <= TOP_K_GUARDRAIL:
+            scores2guesses = self._scores2guesses_top_k_greater_1
+        else:
+            scores2guesses = self._scores2guesses_top_k_guardrail
+        # The behaviour of *_scores2guesses_top_k_greater_1()* is efficient for values
+        # of *top_k>1* that are likely to be useful when the edit tree lemmatizer is used
+        # for its principal purpose of lemmatizing tokens. However, the code could also
+        # be used for other purposes, and with very large values of *top_k* the method
+        # becomes inefficient. In such cases, *_scores2guesses_top_k_guardrail()* is used
+        # instead.
         n_docs = len(list(docs))
         if not any(len(doc) for doc in docs):
             # Handle cases where there are no tokens in any docs.

@@ -153,20 +170,52 @@ class EditTreeLemmatizer(TrainablePipe):
             return guesses
         scores = self.model.predict(docs)
         assert len(scores) == n_docs
-        guesses = self._scores2guesses(docs, scores)
+        guesses = scores2guesses(docs, scores)
         assert len(guesses) == n_docs
         return guesses
 
-    def _scores2guesses(self, docs, scores):
+    def _scores2guesses_top_k_equals_1(self, docs, scores):
         guesses = []
         for doc, doc_scores in zip(docs, scores):
-            if self.top_k == 1:
-                doc_guesses = doc_scores.argmax(axis=1).reshape(-1, 1)
-            else:
-                doc_guesses = np.argsort(doc_scores)[..., : -self.top_k - 1 : -1]
+            doc_guesses = doc_scores.argmax(axis=1)
+            doc_guesses = self.numpy_ops.asarray(doc_guesses)
 
-            if not isinstance(doc_guesses, np.ndarray):
-                doc_guesses = doc_guesses.get()
+            doc_compat_guesses = []
+            for i, token in enumerate(doc):
+                tree_id = self.cfg["labels"][doc_guesses[i]]
+                if self.trees.apply(tree_id, token.text) is not None:
+                    doc_compat_guesses.append(tree_id)
+                else:
+                    doc_compat_guesses.append(-1)
+            guesses.append(np.array(doc_compat_guesses))
+
+        return guesses
+
+    def _scores2guesses_top_k_greater_1(self, docs, scores):
+        guesses = []
+        top_k = min(self.top_k, len(self.labels))
+        for doc, doc_scores in zip(docs, scores):
+            doc_scores = self.numpy_ops.asarray(doc_scores)
+            doc_compat_guesses = []
+            for i, token in enumerate(doc):
+                for _ in range(top_k):
+                    candidate = int(doc_scores[i].argmax())
+                    candidate_tree_id = self.cfg["labels"][candidate]
+                    if self.trees.apply(candidate_tree_id, token.text) is not None:
+                        doc_compat_guesses.append(candidate_tree_id)
+                        break
+                    doc_scores[i, candidate] = np.finfo(np.float32).min
+                else:
+                    doc_compat_guesses.append(-1)
+            guesses.append(np.array(doc_compat_guesses))
+
+        return guesses
+
+    def _scores2guesses_top_k_guardrail(self, docs, scores):
+        guesses = []
+        for doc, doc_scores in zip(docs, scores):
+            doc_guesses = np.argsort(doc_scores)[..., : -self.top_k - 1 : -1]
+            doc_guesses = self.numpy_ops.asarray(doc_guesses)
 
             doc_compat_guesses = []
             for token, candidates in zip(doc, doc_guesses):
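
The new `_scores2guesses_top_k_greater_1` path above retries up to `top_k` candidates per token, masking each rejected candidate before taking the next argmax. A self-contained NumPy sketch of that selection loop, with a stand-in `applies()` predicate instead of spaCy's `EditTrees.apply`:

```python
import numpy as np

def pick_tree_ids(doc_scores: np.ndarray, labels: list, applies, top_k: int) -> np.ndarray:
    """For each token row, try the highest-scoring candidates in order and keep
    the first edit tree that actually applies; fall back to -1 after top_k tries."""
    doc_scores = doc_scores.copy()  # do not mutate the caller's scores
    guesses = []
    for i in range(doc_scores.shape[0]):
        for _ in range(top_k):
            candidate = int(doc_scores[i].argmax())
            tree_id = labels[candidate]
            if applies(tree_id):
                guesses.append(tree_id)
                break
            # Mask the rejected candidate so the next argmax picks the runner-up.
            doc_scores[i, candidate] = np.finfo(np.float32).min
        else:
            guesses.append(-1)
    return np.array(guesses)

# Toy example: tree 7 never applies, so the second-best candidate wins for token 0.
scores = np.array([[0.9, 0.1, 0.0], [0.2, 0.3, 0.5]], dtype=np.float32)
print(pick_tree_ids(scores, labels=[7, 8, 9], applies=lambda t: t != 7, top_k=2))
# -> [8 9]
```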

@@ -163,15 +163,33 @@ class TokenPatternString(BaseModel):
     IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
     INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects")
     FUZZY: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy")
-    FUZZY1: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy1")
-    FUZZY2: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy2")
-    FUZZY3: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy3")
-    FUZZY4: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy4")
-    FUZZY5: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy5")
-    FUZZY6: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy6")
-    FUZZY7: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy7")
-    FUZZY8: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy8")
-    FUZZY9: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy9")
+    FUZZY1: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy1"
+    )
+    FUZZY2: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy2"
+    )
+    FUZZY3: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy3"
+    )
+    FUZZY4: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy4"
+    )
+    FUZZY5: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy5"
+    )
+    FUZZY6: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy6"
+    )
+    FUZZY7: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy7"
+    )
+    FUZZY8: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy8"
+    )
+    FUZZY9: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy9"
+    )
 
     class Config:
         extra = "forbid"

@@ -101,14 +101,15 @@ def test_initialize_from_labels():
     }
 
 
-def test_no_data():
+@pytest.mark.parametrize("top_k", (1, 5, 30))
+def test_no_data(top_k):
     # Test that the lemmatizer provides a nice error when there's no tagging data / labels
     TEXTCAT_DATA = [
         ("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
         ("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
     ]
     nlp = English()
-    nlp.add_pipe("trainable_lemmatizer")
+    nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
     nlp.add_pipe("textcat")
 
     train_examples = []

@@ -119,10 +120,11 @@ def test_no_data():
     nlp.initialize(get_examples=lambda: train_examples)
 
 
-def test_incomplete_data():
+@pytest.mark.parametrize("top_k", (1, 5, 30))
+def test_incomplete_data(top_k):
     # Test that the lemmatizer works with incomplete information
     nlp = English()
-    lemmatizer = nlp.add_pipe("trainable_lemmatizer")
+    lemmatizer = nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
     lemmatizer.min_tree_freq = 1
     train_examples = []
     for t in PARTIAL_DATA:

@@ -154,9 +156,10 @@ def test_incomplete_data():
     assert xp.count_nonzero(dX[1][1]) == 0
 
 
-def test_overfitting_IO():
+@pytest.mark.parametrize("top_k", (1, 5, 30))
+def test_overfitting_IO(top_k):
     nlp = English()
-    lemmatizer = nlp.add_pipe("trainable_lemmatizer")
+    lemmatizer = nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
     lemmatizer.min_tree_freq = 1
     train_examples = []
     for t in TRAIN_DATA:

@@ -189,7 +192,7 @@ def test_overfitting_IO():
     # Check model after a {to,from}_bytes roundtrip
     nlp_bytes = nlp.to_bytes()
     nlp3 = English()
-    nlp3.add_pipe("trainable_lemmatizer")
+    nlp3.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
     nlp3.from_bytes(nlp_bytes)
     doc3 = nlp3(test_text)
     assert doc3[0].lemma_ == "she"

@@ -618,7 +618,6 @@ def test_string_to_list_intify(value):
     assert string_to_list(value, intify=True) == [1, 2, 3]
 
 
-@pytest.mark.skip(reason="Temporarily skip for dev version")
 def test_download_compatibility():
     spec = SpecifierSet("==" + about.__version__)
     spec.prereleases = False

@@ -629,7 +628,6 @@ def test_download_compatibility():
     assert get_minor_version(about.__version__) == get_minor_version(version)
 
 
-@pytest.mark.skip(reason="Temporarily skip for dev version")
 def test_validate_compatibility_table():
     spec = SpecifierSet("==" + about.__version__)
     spec.prereleases = False

website/.dockerignore (Normal file, 9 lines)

@@ -0,0 +1,9 @@
.cache/
.next/
public/
node_modules
.npm
logs
*.log
npm-debug.log*
quickstart-training-generator.js

website/.gitignore (vendored, 2 lines changed)

@@ -1,5 +1,7 @@
 # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
 
+quickstart-training-generator.js
+
 # dependencies
 /node_modules
 /.pnp

@@ -1,16 +1,14 @@
-FROM node:11.15.0
+FROM node:18
 
-WORKDIR /spacy-io
-
-RUN npm install -g gatsby-cli@2.7.4
-
-COPY package.json .
-COPY package-lock.json .
-
-RUN npm install
+USER node
 
 # This is so the installed node_modules will be up one directory
 # from where a user mounts files, so that they don't accidentally mount
 # their own node_modules from a different build
 # https://nodejs.org/api/modules.html#modules_loading_from_node_modules_folders
-WORKDIR /spacy-io/website/
+WORKDIR /home/node
+COPY --chown=node package.json .
+COPY --chown=node package-lock.json .
+RUN npm install
+
+WORKDIR /home/node/website/

@@ -41,33 +41,27 @@ If you'd like to do this, **be sure you do _not_ include your local
 `node_modules` folder**, since there are some dependencies that need to be built
 for the image system. Rename it before using.
 
-```bash
-docker run -it \
-  -v $(pwd):/spacy-io/website \
-  -p 8000:8000 \
-  ghcr.io/explosion/spacy-io \
-  gatsby develop -H 0.0.0.0
-```
-
-This will allow you to access the built website at http://0.0.0.0:8000/ in your
-browser, and still edit code in your editor while having the site reflect those
-changes.
-
-**Note**: If you're working on a Mac with an M1 processor, you might see
-segfault errors from `qemu` if you use the default image. To fix this use the
-`arm64` tagged image in the `docker run` command
-(ghcr.io/explosion/spacy-io:arm64).
-
-### Building the Docker image
-
-If you'd like to build the image locally, you can do so like this:
+First build the Docker image. This only needs to be done on the first run
+or when changes are made to `Dockerfile` or the website dependencies:
 
 ```bash
 docker build -t spacy-io .
 ```
 
-This will take some time, so if you want to use the prebuilt image you'll save a
-bit of time.
+You can then build and run the website with:
+
+```bash
+docker run -it \
+  --rm \
+  -v $(pwd):/home/node/website \
+  -p 3000:3000 \
+  spacy-io \
+  npm run dev -- -H 0.0.0.0
+```
+
+This will allow you to access the built website at http://0.0.0.0:3000/ in your
+browser, and still edit code in your editor while having the site reflect those
+changes.
 
 ## Project structure

@@ -1215,19 +1215,19 @@ When a directory is provided it is traversed recursively to collect all files.
 $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process]
 ```
 
-| Name                                      | Description                                                                                                                                                                              |
-| ----------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `model`                                   | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                     |
-| `data_path`                               | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~                                                    |
-| `output-file`, `-o`                       | Output `DocBin` path. ~~str (positional)~~                                                                                                                                              |
-| `--code`, `-c` <Tag variant="new">3</Tag> | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~   |
-| `--text-key`, `-tk`                       | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~                                                                               |
-| `--force-overwrite`, `-F`                 | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~                       |
-| `--gpu-id`, `-g`                          | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                          |
-| `--batch-size`, `-b`                      | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~                                                                                                                     |
-| `--n-process`, `-n`                       | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~                                                                                                            |
-| `--help`, `-h`                            | Show help message and available arguments. ~~bool (flag)~~                                                                                                                              |
-| **CREATES**                               | A `DocBin` with the annotations from the `model` for all the files found in `data-path`.                                                                                                |
+| Name                      | Description                                                                                                                                                                            |
+| ------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `model`                   | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                   |
+| `data_path`               | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~                                                  |
+| `output-file`, `-o`       | Output `DocBin` path. ~~str (positional)~~                                                                                                                                            |
+| `--code`, `-c`            | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--text-key`, `-tk`       | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~                                                                             |
+| `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~                     |
+| `--gpu-id`, `-g`          | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                        |
+| `--batch-size`, `-b`      | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~                                                                                                                   |
+| `--n-process`, `-n`       | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~                                                                                                          |
+| `--help`, `-h`            | Show help message and available arguments. ~~bool (flag)~~                                                                                                                            |
+| **CREATES**               | A `DocBin` with the annotations from the `model` for all the files found in `data-path`.                                                                                              |
 
 ## find-threshold {id="find-threshold",version="3.5",tag="command"}

@@ -15,7 +15,7 @@ world". It requires a `KnowledgeBase`, as well as a function to generate
 plausible candidates from that `KnowledgeBase` given a certain textual mention,
 and a machine learning model to pick the right candidate, given the local
 context of the mention. `EntityLinker` defaults to using the
-[`InMemoryLookupKB`](/api/kb_in_memory) implementation.
+[`InMemoryLookupKB`](/api/inmemorylookupkb) implementation.
 
 ## Assigned Attributes {id="assigned-attributes"}

@@ -43,7 +43,7 @@ The length of the fixed-size entity vectors in the knowledge base.
 
 Add an entity to the knowledge base, specifying its corpus frequency and entity
 vector, which should be of length
-[`entity_vector_length`](/api/kb_in_memory#entity_vector_length).
+[`entity_vector_length`](/api/inmemorylookupkb#entity_vector_length).
 
 > #### Example
 >

@@ -79,8 +79,9 @@ frequency and entity vector for each entity.
 
 Add an alias or mention to the knowledge base, specifying its potential KB
 identifiers and their prior probabilities. The entity identifiers should refer
-to entities previously added with [`add_entity`](/api/kb_in_memory#add_entity)
-or [`set_entities`](/api/kb_in_memory#set_entities). The sum of the prior
+to entities previously added with
+[`add_entity`](/api/inmemorylookupkb#add_entity) or
+[`set_entities`](/api/inmemorylookupkb#set_entities). The sum of the prior
 probabilities should not exceed 1. Note that an empty string can not be used as
 alias.

@@ -156,7 +157,7 @@ Get a list of all aliases in the knowledge base.
 
 Given a certain textual mention as input, retrieve a list of candidate entities
 of type [`Candidate`](/api/kb#candidate). Wraps
-[`get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
+[`get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
 
 > #### Example
 >

@@ -174,7 +175,7 @@ of type [`Candidate`](/api/kb#candidate). Wraps
 
 ## InMemoryLookupKB.get_candidates_batch {id="get_candidates_batch",tag="method"}
 
-Same as [`get_candidates()`](/api/kb_in_memory#get_candidates), but for an
+Same as [`get_candidates()`](/api/inmemorylookupkb#get_candidates), but for an
 arbitrary number of mentions. The [`EntityLinker`](/api/entitylinker) component
 will call `get_candidates_batch()` instead of `get_candidates()`, if the config
 parameter `candidates_batch_size` is greater or equal than 1.

@@ -231,7 +232,7 @@ Given a certain entity ID, retrieve its pretrained entity vector.
 
 ## InMemoryLookupKB.get_vectors {id="get_vectors",tag="method"}
 
-Same as [`get_vector()`](/api/kb_in_memory#get_vector), but for an arbitrary
+Same as [`get_vector()`](/api/inmemorylookupkb#get_vector), but for an arbitrary
 number of entity IDs.
 
 The default implementation of `get_vectors()` executes `get_vector()` in a loop.

@@ -21,8 +21,8 @@ functions called by the [`EntityLinker`](/api/entitylinker) component.
 <Infobox variant="warning">
 
 This class was not abstract up to spaCy version 3.5. The `KnowledgeBase`
-implementation up to that point is available as `InMemoryLookupKB` from 3.5
-onwards.
+implementation up to that point is available as
+[`InMemoryLookupKB`](/api/inmemorylookupkb) from 3.5 onwards.
 
 </Infobox>

@@ -110,14 +110,15 @@ to you.
 </Infobox>
 
 From spaCy 3.5 on `KnowledgeBase` is an abstract class (with
-[`InMemoryLookupKB`](/api/kb_in_memory) being a drop-in replacement) to allow
-more flexibility in customizing knowledge bases. Some of its methods were moved
-to [`InMemoryLookupKB`](/api/kb_in_memory) during this refactoring, one of those
-being `get_alias_candidates()`. This method is now available as
-[`InMemoryLookupKB.get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
-Note: [`InMemoryLookupKB.get_candidates()`](/api/kb_in_memory#get_candidates)
+[`InMemoryLookupKB`](/api/inmemorylookupkb) being a drop-in replacement) to
+allow more flexibility in customizing knowledge bases. Some of its methods were
+moved to [`InMemoryLookupKB`](/api/inmemorylookupkb) during this refactoring,
+one of those being `get_alias_candidates()`. This method is now available as
+[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
+Note:
+[`InMemoryLookupKB.get_candidates()`](/api/inmemorylookupkb#get_candidates)
 defaults to
-[`InMemoryLookupKB.get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
+[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
 
 ## KnowledgeBase.get_vector {id="get_vector",tag="method"}

@@ -79,7 +79,7 @@ operates on a `Doc` and gives you access to the matched tokens **in context**.
 | ------------------------------------------------ | ---------------------------------------------------------------------------------------------------- |
 | [`Corpus`](/api/corpus)                          | Class for managing annotated corpora for training and evaluation data.                                |
 | [`KnowledgeBase`](/api/kb)                       | Abstract base class for storage and retrieval of data for entity linking.                             |
-| [`InMemoryLookupKB`](/api/kb_in_memory)          | Implementation of `KnowledgeBase` storing all data in memory.                                         |
+| [`InMemoryLookupKB`](/api/inmemorylookupkb)      | Implementation of `KnowledgeBase` storing all data in memory.                                         |
 | [`Candidate`](/api/kb#candidate)                 | Object associating a textual mention with a specific entity contained in a `KnowledgeBase`.           |
 | [`Lookups`](/api/lookups)                        | Container for convenient access to large lookup tables and dictionaries.                              |
 | [`MorphAnalysis`](/api/morphology#morphanalysis) | A morphological analysis.                                                                              |

@@ -384,14 +384,14 @@ the more specific attributes `FUZZY1`..`FUZZY9` you can specify the maximum
 allowed edit distance directly.
 
 ```python
-# Match lowercase with fuzzy matching (allows 2 edits)
+# Match lowercase with fuzzy matching (allows 3 edits)
 pattern = [{"LOWER": {"FUZZY": "definitely"}}]
 
-# Match custom attribute values with fuzzy matching (allows 2 edits)
+# Match custom attribute values with fuzzy matching (allows 3 edits)
 pattern = [{"_": {"country": {"FUZZY": "Kyrgyzstan"}}}]
 
-# Match with exact Levenshtein edit distance limits (allows 3 edits)
-pattern = [{"_": {"country": {"FUZZY3": "Kyrgyzstan"}}}]
+# Match with exact Levenshtein edit distance limits (allows 4 edits)
+pattern = [{"_": {"country": {"FUZZY4": "Kyrgyzstan"}}}]
 ```
 
 #### Regex and fuzzy matching with lists {id="regex-fuzzy-lists", version="3.5"}
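
For context, a small usage sketch of the fuzzy patterns documented above (assumes spaCy v3.5+ is installed; the misspelling and match label are invented for illustration):

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# Default FUZZY allows max(2, 30% of pattern length) edits, i.e. 3 for "definitely".
matcher.add("DEFINITELY", [[{"LOWER": {"FUZZY": "definitely"}}]])

doc = nlp("I will definately be there.")
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)
# DEFINITELY definately
```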

@@ -684,10 +684,15 @@ If your pipeline includes
 [custom components](/usage/processing-pipelines#custom-components), model
 architectures or other [code](/usage/training#custom-code), those functions need
 to be registered **before** your pipeline is loaded. Otherwise, spaCy won't know
-how to create the objects referenced in the config. The
-[`spacy package`](/api/cli#package) command lets you provide one or more paths
-to Python files containing custom registered functions using the `--code`
-argument.
+how to create the objects referenced in the config. If you're loading your own
+pipeline in Python, you can make custom components available just by importing
+the code that defines them before calling
+[`spacy.load`](/api/top-level#spacy.load). This is also how the `--code`
+argument to CLI commands works.
+
+With the [`spacy package`](/api/cli#package) command, you can provide one or
+more paths to Python files containing custom registered functions using the
+`--code` argument.
 
 > #### \_\_init\_\_.py (excerpt)
 >
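
A minimal sketch of the pattern the amended passage describes: a module that registers a custom component, so that importing it before `spacy.load` makes the component available (the module and pipeline names are hypothetical):

```python
# my_components.py -- hypothetical module that registers a custom component
from spacy.language import Language

@Language.component("print_length")
def print_length(doc):
    # Toy component: report the number of tokens, then pass the Doc along unchanged.
    print(f"Doc has {len(doc)} tokens")
    return doc

# Elsewhere, importing this module before loading is enough, because the
# decorator registers the component as a side effect:
#     import my_components
#     nlp = spacy.load("./pipeline_with_print_length")
```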

website/docs/usage/v3-5.mdx (Normal file, 215 lines)

@@ -0,0 +1,215 @@
---
title: What's New in v3.5
teaser: New features and how to upgrade
menu:
  - ['New Features', 'features']
  - ['Upgrading Notes', 'upgrading']
---

## New features {id="features",hidden="true"}

spaCy v3.5 introduces three new CLI commands, `apply`, `benchmark` and
`find-threshold`, adds fuzzy matching, provides improvements to our entity
linking functionality, and includes a range of language updates and bug fixes.

### New CLI commands {id="cli"}

#### apply CLI

The [`apply` CLI](/api/cli#apply) can be used to apply a pipeline to one or more
`.txt`, `.jsonl` or `.spacy` input files, saving the annotated docs in a single
`.spacy` file.

```bash
$ spacy apply en_core_web_sm my_texts/ output.spacy
```

#### benchmark CLI

The [`benchmark` CLI](/api/cli#benchmark) has been added to extend the existing
`evaluate` functionality with a wider range of profiling subcommands.

The `benchmark accuracy` CLI is introduced as an alias for `evaluate`. The new
`benchmark speed` CLI performs warmup rounds before measuring the speed in words
per second on batches of randomly shuffled documents from the provided data.

```bash
$ spacy benchmark speed my_pipeline data.spacy
```

The output is the mean performance using batches (`nlp.pipe`) with a 95%
confidence interval, e.g., profiling `en_core_web_sm` on CPU:

```none
Outliers: 2.0%, extreme outliers: 0.0%
Mean: 18904.1 words/s (95% CI: -256.9 +244.1)
```

#### find-threshold CLI

The [`find-threshold` CLI](/api/cli#find-threshold) runs a series of trials
across threshold values from `0.0` to `1.0` and identifies the best threshold
for the provided score metric.

The following command runs 20 trials for the `spancat` component in
`my_pipeline`, recording the `spans_sc_f` score for each value of the threshold
`[components.spancat.threshold]` from `0.0` to `1.0`:

```bash
$ spacy find-threshold my_pipeline data.spacy spancat threshold spans_sc_f --n_trials 20
```

The `find-threshold` CLI can be used with `textcat_multilabel`, `spancat` and
custom components with thresholds that are applied while predicting or scoring.

### Fuzzy matching {id="fuzzy"}

New `FUZZY` operators support [fuzzy matching](/usage/rule-based-matching#fuzzy)
with the `Matcher`. By default, the `FUZZY` operator allows a Levenshtein edit
distance of 2 and up to 30% of the pattern string length. `FUZZY1`..`FUZZY9` can
be used to specify the exact number of allowed edits.

```python
# Match lowercase with fuzzy matching (allows up to 3 edits)
pattern = [{"LOWER": {"FUZZY": "definitely"}}]

# Match custom attribute values with fuzzy matching (allows up to 3 edits)
pattern = [{"_": {"country": {"FUZZY": "Kyrgyzstan"}}}]

# Match with exact Levenshtein edit distance limits (allows up to 4 edits)
pattern = [{"_": {"country": {"FUZZY4": "Kyrgyzstan"}}}]
```

Note that `FUZZY` uses Levenshtein edit distance rather than Damerau-Levenshtein
edit distance, so a transposition like `teh` for `the` counts as two edits, one
insertion and one deletion.

If you'd prefer an alternate fuzzy matching algorithm, you can provide your own
custom method to the `Matcher` or as a config option for an entity ruler and
span ruler.

### FUZZY and REGEX with lists {id="fuzzy-regex-lists"}

The `FUZZY` and `REGEX` operators are also now supported for lists with `IN` and
`NOT_IN`:

```python
pattern = [{"TEXT": {"FUZZY": {"IN": ["awesome", "cool", "wonderful"]}}}]
pattern = [{"TEXT": {"REGEX": {"NOT_IN": ["^awe(some)?$", "^wonder(ful)?"]}}}]
```

### Entity linking generalization {id="el"}

The knowledge base used for entity linking is now easier to customize and has a
new default implementation [`InMemoryLookupKB`](/api/inmemorylookupkb).

### Additional features and improvements {id="additional-features-and-improvements"}

- Language updates:
  - Extended support for Slovenian
  - Fixed lookup fallback for French and Catalan lemmatizers
  - Switch Russian and Ukrainian lemmatizers to `pymorphy3`
  - Support for editorial punctuation in Ancient Greek
  - Update to Russian tokenizer exceptions
  - Small fix for Dutch stop words
- Allow up to `typer` v0.7.x, `mypy` 0.990 and `typing_extensions` v4.4.x.
- New `spacy.ConsoleLogger.v3` with expanded progress
  [tracking](/api/top-level#ConsoleLogger).
- Improved scoring behavior for `textcat` with `spacy.textcat_scorer.v2` and
  `spacy.textcat_multilabel_scorer.v2`.
- Updates so that downstream components can train properly on a frozen `tok2vec`
  or `transformer` layer.
- Allow interpolation of variables in directory names in projects.
- Support for local file system [remotes](/usage/projects#remote) for projects.
- Improve UX around `displacy.serve` when the default port is in use.
- Optional `before_update` callback that is invoked at the start of each
  [training step](/api/data-formats#config-training).
- Improve performance of `SpanGroup` and fix typing issues for `SpanGroup` and
  `Span` objects.
- Patch a
  [security vulnerability](https://github.com/advisories/GHSA-gw9q-c7gh-j9vm) in
  extracting tar files.
- Add equality definition for `Vectors`.
- Ensure `Vocab.to_disk` respects the exclude setting for `lookups` and
  `vectors`.
- Correctly handle missing annotations in the edit tree lemmatizer.

### Trained pipeline updates {id="pipelines"}

- The CNN pipelines add `IS_SPACE` as a `tok2vec` feature for `tagger` and
  `morphologizer` components to improve tagging of non-whitespace vs. whitespace
  tokens.
- The transformer pipelines require `spacy-transformers` v1.2, which uses the
  exact alignment from `tokenizers` for fast tokenizers instead of the heuristic
  alignment from `spacy-alignments`. For all trained pipelines except
  `ja_core_news_trf`, the alignments between spaCy tokens and transformer tokens
  may be slightly different. More details about the `spacy-transformers` changes
  in the
  [v1.2.0 release notes](https://github.com/explosion/spacy-transformers/releases/tag/v1.2.0).

## Notes about upgrading from v3.4 {id="upgrading"}

### Validation of textcat values {id="textcat-validation"}

An error is now raised when unsupported values are given as input to train a
`textcat` or `textcat_multilabel` model - ensure that values are `0.0` or `1.0`
as explained in the [docs](/api/textcategorizer#assigned-attributes).

### Updated scorers for tokenization and textcat {id="scores"}

We fixed a bug that inflated the `token_acc` scores in v3.0-v3.4. The reported
`token_acc` will drop from v3.4 to v3.5, but if `token_p/r/f` stay the same,
your tokenization performance has not changed from v3.4.

For new `textcat` or `textcat_multilabel` configs, the new default `v2` scorers:

- ignore `threshold` for `textcat`, so the reported `cats_p/r/f` may increase
  slightly in v3.5 even though the underlying predictions are unchanged
- report the performance of only the **final** `textcat` or `textcat_multilabel`
  component in the pipeline by default
- allow custom scorers to be used to score multiple `textcat` and
  `textcat_multilabel` components with `Scorer.score_cats` by restricting the
  evaluation to the component's provided labels

### Pipeline package version compatibility {id="version-compat"}

> #### Using legacy implementations
>
> In spaCy v3, you'll still be able to load and reference legacy implementations
> via [`spacy-legacy`](https://github.com/explosion/spacy-legacy), even if the
> components or architectures change and newer versions are available in the
> core library.

When you're loading a pipeline package trained with an earlier version of spaCy
v3, you will see a warning telling you that the pipeline may be incompatible.
This doesn't necessarily have to be true, but we recommend running your
pipelines against your test suite or evaluation data to make sure there are no
unexpected results.

If you're using one of the [trained pipelines](/models) we provide, you should
run [`spacy download`](/api/cli#download) to update to the latest version. To
see an overview of all installed packages and their compatibility, you can run
[`spacy validate`](/api/cli#validate).

If you've trained your own custom pipeline and you've confirmed that it's still
working as expected, you can update the spaCy version requirements in the
[`meta.json`](/api/data-formats#meta):

```diff
- "spacy_version": ">=3.4.0,<3.5.0",
+ "spacy_version": ">=3.4.0,<3.6.0",
```

### Updating v3.4 configs

To update a config from spaCy v3.4 with the new v3.5 settings, run
[`init fill-config`](/api/cli#init-fill-config):

```cli
$ python -m spacy init fill-config config-v3.4.cfg config-v3.5.cfg
```

In many cases ([`spacy train`](/api/cli#train),
[`spacy.load`](/api/top-level#spacy.load)), the new defaults will be filled in
automatically, but you'll need to fill in the new settings to run
[`debug config`](/api/cli#debug) and [`debug data`](/api/cli#debug-data).
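
As a companion to the entity-linking section in the new usage page above, a small sketch of instantiating the default `InMemoryLookupKB` directly (requires spaCy v3.5+; the entity ID, frequency and vector length are made up for illustration):

```python
import spacy
from spacy.kb import InMemoryLookupKB

nlp = spacy.blank("en")
# 64 is an arbitrary entity vector length chosen for this example.
kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=64)
kb.add_entity(entity="Q42", freq=12, entity_vector=[0.0] * 64)
kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[1.0])
print(kb.get_alias_candidates("Douglas Adams"))
```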

@@ -13,7 +13,8 @@
         { "text": "New in v3.1", "url": "/usage/v3-1" },
         { "text": "New in v3.2", "url": "/usage/v3-2" },
         { "text": "New in v3.3", "url": "/usage/v3-3" },
-        { "text": "New in v3.4", "url": "/usage/v3-4" }
+        { "text": "New in v3.4", "url": "/usage/v3-4" },
+        { "text": "New in v3.5", "url": "/usage/v3-5" }
       ]
     },
     {

@@ -129,6 +130,7 @@
       "items": [
         { "text": "Attributes", "url": "/api/attributes" },
         { "text": "Corpus", "url": "/api/corpus" },
+        { "text": "InMemoryLookupKB", "url": "/api/inmemorylookupkb" },
         { "text": "KnowledgeBase", "url": "/api/kb" },
         { "text": "Lookups", "url": "/api/lookups" },
         { "text": "MorphAnalysis", "url": "/api/morphology#morphanalysis" },
|
@ -89,7 +89,7 @@ const Landing = () => {
|
|||
</LandingCard>
|
||||
|
||||
<LandingCard title="Awesome ecosystem" url="/usage/projects" button="Read more">
|
||||
In the five years since its release, spaCy has become an industry standard with
|
||||
Since its release in 2015, spaCy has become an industry standard with
|
||||
a huge ecosystem. Choose from a variety of plugins, integrate with your machine
|
||||
learning stack and build custom components and workflows.
|
||||
</LandingCard>
|
||||
|
|
|

@@ -9,6 +9,8 @@ import socialImageLegacy from '../images/social_legacy.jpg'
 import siteMetadata from '../../meta/site.json'
 import Head from 'next/head'
 
+import { siteUrl } from '../../meta/dynamicMeta.mjs'
+
 function getPageTitle(title, sitename, slogan, sectionTitle, nightly, legacy) {
     if (sectionTitle && title) {
         const suffix = nightly ? ' (nightly)' : legacy ? ' (legacy)' : ''

@@ -25,7 +27,7 @@ function getImage(section, nightly, legacy) {
     if (legacy) return socialImageLegacy
     if (section === 'api') return socialImageApi
     if (section === 'universe') return socialImageUniverse
-    return socialImageDefault
+    return `${siteUrl}${socialImageDefault.src}`
 }
 
 export default function SEO({

@@ -46,7 +48,7 @@ export default function SEO({
         nightly,
         legacy
     )
-    const socialImage = getImage(section, nightly, legacy).src
+    const socialImage = getImage(section, nightly, legacy)
     const meta = [
         {
             name: 'description',

@@ -20,6 +20,10 @@
     display: inline-block
     margin-bottom: var(--spacing-sm)
 
+.ol, .ul
+    margin-top: var(--spacing-xs)
+    margin-bottom: var(--spacing-xs)
+
 &:before
     content: '\25CF'
     position: relative

@@ -58,8 +58,8 @@ const AlertSpace = ({ nightly, legacy }) => {
 }
 
 const navAlert = (
-    <Link to="/usage/v3-4" hidden>
-        <strong>💥 Out now:</strong> spaCy v3.4
+    <Link to="/usage/v3-5" hidden>
+        <strong>💥 Out now:</strong> spaCy v3.5
     </Link>
 )
 