mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-05 12:50:20 +03:00
Merge branch 'explosion:master' into rapidfuzz
This commit is contained in:
commit
b690c9120c
44
.github/azure-steps.yml
vendored
44
.github/azure-steps.yml
vendored
|
@ -52,17 +52,17 @@ steps:
|
||||||
python -W error -c "import spacy"
|
python -W error -c "import spacy"
|
||||||
displayName: "Test import"
|
displayName: "Test import"
|
||||||
|
|
||||||
- script: |
|
# - script: |
|
||||||
python -m spacy download ca_core_news_sm
|
# python -m spacy download ca_core_news_sm
|
||||||
python -m spacy download ca_core_news_md
|
# python -m spacy download ca_core_news_md
|
||||||
python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
|
# python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
|
||||||
displayName: 'Test download CLI'
|
# displayName: 'Test download CLI'
|
||||||
condition: eq(variables['python_version'], '3.8')
|
# condition: eq(variables['python_version'], '3.8')
|
||||||
|
#
|
||||||
- script: |
|
# - script: |
|
||||||
python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
|
# python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
|
||||||
displayName: 'Test no warnings on load (#11713)'
|
# displayName: 'Test no warnings on load (#11713)'
|
||||||
condition: eq(variables['python_version'], '3.8')
|
# condition: eq(variables['python_version'], '3.8')
|
||||||
|
|
||||||
- script: |
|
- script: |
|
||||||
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
|
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
|
||||||
|
@ -86,17 +86,17 @@ steps:
|
||||||
displayName: 'Test train CLI'
|
displayName: 'Test train CLI'
|
||||||
condition: eq(variables['python_version'], '3.8')
|
condition: eq(variables['python_version'], '3.8')
|
||||||
|
|
||||||
- script: |
|
# - script: |
|
||||||
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
|
# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
|
||||||
PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
|
# PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
|
||||||
displayName: 'Test assemble CLI'
|
# displayName: 'Test assemble CLI'
|
||||||
condition: eq(variables['python_version'], '3.8')
|
# condition: eq(variables['python_version'], '3.8')
|
||||||
|
#
|
||||||
- script: |
|
# - script: |
|
||||||
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
|
# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
|
||||||
python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
|
# python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
|
||||||
displayName: 'Test assemble CLI vectors warning'
|
# displayName: 'Test assemble CLI vectors warning'
|
||||||
condition: eq(variables['python_version'], '3.8')
|
# condition: eq(variables['python_version'], '3.8')
|
||||||
|
|
||||||
- script: |
|
- script: |
|
||||||
python -m pip install -U -r requirements.txt
|
python -m pip install -U -r requirements.txt
|
||||||
|
|
8
.github/workflows/lock.yml
vendored
8
.github/workflows/lock.yml
vendored
|
@ -15,11 +15,11 @@ jobs:
|
||||||
action:
|
action:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: dessant/lock-threads@v3
|
- uses: dessant/lock-threads@v4
|
||||||
with:
|
with:
|
||||||
process-only: 'issues'
|
process-only: 'issues'
|
||||||
issue-inactive-days: '30'
|
issue-inactive-days: '30'
|
||||||
issue-comment: >
|
issue-comment: >
|
||||||
This thread has been automatically locked since there
|
This thread has been automatically locked since there
|
||||||
has not been any recent activity after it was closed.
|
has not been any recent activity after it was closed.
|
||||||
Please open a new issue for related bugs.
|
Please open a new issue for related bugs.
|
||||||
|
|
|
@ -14,7 +14,7 @@ parsing, **named entity recognition**, **text classification** and more,
|
||||||
multi-task learning with pretrained **transformers** like BERT, as well as a
|
multi-task learning with pretrained **transformers** like BERT, as well as a
|
||||||
production-ready [**training system**](https://spacy.io/usage/training) and easy
|
production-ready [**training system**](https://spacy.io/usage/training) and easy
|
||||||
model packaging, deployment and workflow management. spaCy is commercial
|
model packaging, deployment and workflow management. spaCy is commercial
|
||||||
open-source software, released under the MIT license.
|
open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
|
||||||
|
|
||||||
💫 **Version 3.4 out now!**
|
💫 **Version 3.4 out now!**
|
||||||
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
|
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
|
||||||
|
@ -46,6 +46,7 @@ open-source software, released under the MIT license.
|
||||||
| 🛠 **[Changelog]** | Changes and version history. |
|
| 🛠 **[Changelog]** | Changes and version history. |
|
||||||
| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
|
| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
|
||||||
| <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |
|
| <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |
|
||||||
|
| <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-analysis)** |
|
||||||
|
|
||||||
[spacy 101]: https://spacy.io/usage/spacy-101
|
[spacy 101]: https://spacy.io/usage/spacy-101
|
||||||
[new in v3.0]: https://spacy.io/usage/v3
|
[new in v3.0]: https://spacy.io/usage/v3
|
||||||
|
@ -59,6 +60,7 @@ open-source software, released under the MIT license.
|
||||||
[changelog]: https://spacy.io/usage#changelog
|
[changelog]: https://spacy.io/usage#changelog
|
||||||
[contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
|
[contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
|
||||||
|
|
||||||
|
|
||||||
## 💬 Where to ask questions
|
## 💬 Where to ask questions
|
||||||
|
|
||||||
The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).
|
The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).
|
||||||
|
|
|
@ -41,7 +41,7 @@ jobs:
|
||||||
matrix:
|
matrix:
|
||||||
# We're only running one platform per Python version to speed up builds
|
# We're only running one platform per Python version to speed up builds
|
||||||
Python36Linux:
|
Python36Linux:
|
||||||
imageName: "ubuntu-latest"
|
imageName: "ubuntu-20.04"
|
||||||
python.version: "3.6"
|
python.version: "3.6"
|
||||||
# Python36Windows:
|
# Python36Windows:
|
||||||
# imageName: "windows-latest"
|
# imageName: "windows-latest"
|
||||||
|
@ -50,7 +50,7 @@ jobs:
|
||||||
# imageName: "macos-latest"
|
# imageName: "macos-latest"
|
||||||
# python.version: "3.6"
|
# python.version: "3.6"
|
||||||
# Python37Linux:
|
# Python37Linux:
|
||||||
# imageName: "ubuntu-latest"
|
# imageName: "ubuntu-20.04"
|
||||||
# python.version: "3.7"
|
# python.version: "3.7"
|
||||||
Python37Windows:
|
Python37Windows:
|
||||||
imageName: "windows-latest"
|
imageName: "windows-latest"
|
||||||
|
|
|
@ -10,7 +10,7 @@ wasabi>=0.9.1,<1.1.0
|
||||||
srsly>=2.4.3,<3.0.0
|
srsly>=2.4.3,<3.0.0
|
||||||
catalogue>=2.0.6,<2.1.0
|
catalogue>=2.0.6,<2.1.0
|
||||||
typer>=0.3.0,<0.8.0
|
typer>=0.3.0,<0.8.0
|
||||||
pathy>=0.3.5
|
pathy>=0.10.0
|
||||||
smart-open>=5.2.1,<7.0.0
|
smart-open>=5.2.1,<7.0.0
|
||||||
# Third party dependencies
|
# Third party dependencies
|
||||||
numpy>=1.15.0
|
numpy>=1.15.0
|
||||||
|
|
|
@ -52,7 +52,7 @@ install_requires =
|
||||||
catalogue>=2.0.6,<2.1.0
|
catalogue>=2.0.6,<2.1.0
|
||||||
# Third-party dependencies
|
# Third-party dependencies
|
||||||
typer>=0.3.0,<0.8.0
|
typer>=0.3.0,<0.8.0
|
||||||
pathy>=0.3.5
|
pathy>=0.10.0
|
||||||
smart-open>=5.2.1,<7.0.0
|
smart-open>=5.2.1,<7.0.0
|
||||||
tqdm>=4.38.0,<5.0.0
|
tqdm>=4.38.0,<5.0.0
|
||||||
numpy>=1.15.0
|
numpy>=1.15.0
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
# fmt: off
|
# fmt: off
|
||||||
__title__ = "spacy"
|
__title__ = "spacy"
|
||||||
__version__ = "3.4.2"
|
__version__ = "3.5.0"
|
||||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||||
__projects__ = "https://github.com/explosion/projects"
|
__projects__ = "https://github.com/explosion/projects"
|
||||||
|
|
|
@ -23,7 +23,7 @@ from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
|
||||||
from .. import about
|
from .. import about
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from pathy import Pathy # noqa: F401
|
from pathy import FluidPath # noqa: F401
|
||||||
|
|
||||||
|
|
||||||
SDIST_SUFFIX = ".tar.gz"
|
SDIST_SUFFIX = ".tar.gz"
|
||||||
|
@ -158,15 +158,15 @@ def load_project_config(
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
validate_project_version(config)
|
validate_project_version(config)
|
||||||
validate_project_commands(config)
|
validate_project_commands(config)
|
||||||
|
if interpolate:
|
||||||
|
err = f"{PROJECT_FILE} validation error"
|
||||||
|
with show_validation_error(title=err, hint_fill=False):
|
||||||
|
config = substitute_project_variables(config, overrides)
|
||||||
# Make sure directories defined in config exist
|
# Make sure directories defined in config exist
|
||||||
for subdir in config.get("directories", []):
|
for subdir in config.get("directories", []):
|
||||||
dir_path = path / subdir
|
dir_path = path / subdir
|
||||||
if not dir_path.exists():
|
if not dir_path.exists():
|
||||||
dir_path.mkdir(parents=True)
|
dir_path.mkdir(parents=True)
|
||||||
if interpolate:
|
|
||||||
err = f"{PROJECT_FILE} validation error"
|
|
||||||
with show_validation_error(title=err, hint_fill=False):
|
|
||||||
config = substitute_project_variables(config, overrides)
|
|
||||||
return config
|
return config
|
||||||
|
|
||||||
|
|
||||||
|
@ -331,7 +331,7 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
|
||||||
msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
|
msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
|
||||||
|
|
||||||
|
|
||||||
def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
|
def upload_file(src: Path, dest: Union[str, "FluidPath"]) -> None:
|
||||||
"""Upload a file.
|
"""Upload a file.
|
||||||
|
|
||||||
src (Path): The source path.
|
src (Path): The source path.
|
||||||
|
@ -339,13 +339,20 @@ def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
|
||||||
"""
|
"""
|
||||||
import smart_open
|
import smart_open
|
||||||
|
|
||||||
|
# Create parent directories for local paths
|
||||||
|
if isinstance(dest, Path):
|
||||||
|
if not dest.parent.exists():
|
||||||
|
dest.parent.mkdir(parents=True)
|
||||||
|
|
||||||
dest = str(dest)
|
dest = str(dest)
|
||||||
with smart_open.open(dest, mode="wb") as output_file:
|
with smart_open.open(dest, mode="wb") as output_file:
|
||||||
with src.open(mode="rb") as input_file:
|
with src.open(mode="rb") as input_file:
|
||||||
output_file.write(input_file.read())
|
output_file.write(input_file.read())
|
||||||
|
|
||||||
|
|
||||||
def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False) -> None:
|
def download_file(
|
||||||
|
src: Union[str, "FluidPath"], dest: Path, *, force: bool = False
|
||||||
|
) -> None:
|
||||||
"""Download a file using smart_open.
|
"""Download a file using smart_open.
|
||||||
|
|
||||||
url (str): The URL of the file.
|
url (str): The URL of the file.
|
||||||
|
@ -368,7 +375,7 @@ def ensure_pathy(path):
|
||||||
slow and annoying Google Cloud warning)."""
|
slow and annoying Google Cloud warning)."""
|
||||||
from pathy import Pathy # noqa: F811
|
from pathy import Pathy # noqa: F811
|
||||||
|
|
||||||
return Pathy(path)
|
return Pathy.fluid(path)
|
||||||
|
|
||||||
|
|
||||||
def git_checkout(
|
def git_checkout(
|
||||||
|
|
|
@ -13,6 +13,7 @@ from ._util import import_code, debug_cli, _format_number
|
||||||
from ..training import Example, remove_bilu_prefix
|
from ..training import Example, remove_bilu_prefix
|
||||||
from ..training.initialize import get_sourced_components
|
from ..training.initialize import get_sourced_components
|
||||||
from ..schemas import ConfigSchemaTraining
|
from ..schemas import ConfigSchemaTraining
|
||||||
|
from ..pipeline import TrainablePipe
|
||||||
from ..pipeline._parser_internals import nonproj
|
from ..pipeline._parser_internals import nonproj
|
||||||
from ..pipeline._parser_internals.nonproj import DELIMITER
|
from ..pipeline._parser_internals.nonproj import DELIMITER
|
||||||
from ..pipeline import Morphologizer, SpanCategorizer
|
from ..pipeline import Morphologizer, SpanCategorizer
|
||||||
|
@ -934,6 +935,7 @@ def _get_labels_from_model(nlp: Language, factory_name: str) -> Set[str]:
|
||||||
labels: Set[str] = set()
|
labels: Set[str] = set()
|
||||||
for pipe_name in pipe_names:
|
for pipe_name in pipe_names:
|
||||||
pipe = nlp.get_pipe(pipe_name)
|
pipe = nlp.get_pipe(pipe_name)
|
||||||
|
assert isinstance(pipe, TrainablePipe)
|
||||||
labels.update(pipe.labels)
|
labels.update(pipe.labels)
|
||||||
return labels
|
return labels
|
||||||
|
|
||||||
|
|
|
@ -5,15 +5,17 @@ import hashlib
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
import tarfile
|
import tarfile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from wasabi import msg
|
||||||
|
|
||||||
from .._util import get_hash, get_checksum, download_file, ensure_pathy
|
from .._util import get_hash, get_checksum, upload_file, download_file
|
||||||
from ...util import make_tempdir, get_minor_version, ENV_VARS, check_bool_env_var
|
from .._util import ensure_pathy, make_tempdir
|
||||||
|
from ...util import get_minor_version, ENV_VARS, check_bool_env_var
|
||||||
from ...git_info import GIT_VERSION
|
from ...git_info import GIT_VERSION
|
||||||
from ... import about
|
from ... import about
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from pathy import Pathy # noqa: F401
|
from pathy import FluidPath # noqa: F401
|
||||||
|
|
||||||
|
|
||||||
class RemoteStorage:
|
class RemoteStorage:
|
||||||
|
@ -28,7 +30,7 @@ class RemoteStorage:
|
||||||
self.url = ensure_pathy(url)
|
self.url = ensure_pathy(url)
|
||||||
self.compression = compression
|
self.compression = compression
|
||||||
|
|
||||||
def push(self, path: Path, command_hash: str, content_hash: str) -> "Pathy":
|
def push(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
|
||||||
"""Compress a file or directory within a project and upload it to a remote
|
"""Compress a file or directory within a project and upload it to a remote
|
||||||
storage. If an object exists at the full URL, nothing is done.
|
storage. If an object exists at the full URL, nothing is done.
|
||||||
|
|
||||||
|
@ -49,9 +51,7 @@ class RemoteStorage:
|
||||||
mode_string = f"w:{self.compression}" if self.compression else "w"
|
mode_string = f"w:{self.compression}" if self.compression else "w"
|
||||||
with tarfile.open(tar_loc, mode=mode_string) as tar_file:
|
with tarfile.open(tar_loc, mode=mode_string) as tar_file:
|
||||||
tar_file.add(str(loc), arcname=str(path))
|
tar_file.add(str(loc), arcname=str(path))
|
||||||
with tar_loc.open(mode="rb") as input_file:
|
upload_file(tar_loc, url)
|
||||||
with url.open(mode="wb") as output_file:
|
|
||||||
output_file.write(input_file.read())
|
|
||||||
return url
|
return url
|
||||||
|
|
||||||
def pull(
|
def pull(
|
||||||
|
@ -60,7 +60,7 @@ class RemoteStorage:
|
||||||
*,
|
*,
|
||||||
command_hash: Optional[str] = None,
|
command_hash: Optional[str] = None,
|
||||||
content_hash: Optional[str] = None,
|
content_hash: Optional[str] = None,
|
||||||
) -> Optional["Pathy"]:
|
) -> Optional["FluidPath"]:
|
||||||
"""Retrieve a file from the remote cache. If the file already exists,
|
"""Retrieve a file from the remote cache. If the file already exists,
|
||||||
nothing is done.
|
nothing is done.
|
||||||
|
|
||||||
|
@ -110,25 +110,37 @@ class RemoteStorage:
|
||||||
*,
|
*,
|
||||||
command_hash: Optional[str] = None,
|
command_hash: Optional[str] = None,
|
||||||
content_hash: Optional[str] = None,
|
content_hash: Optional[str] = None,
|
||||||
) -> Optional["Pathy"]:
|
) -> Optional["FluidPath"]:
|
||||||
"""Find the best matching version of a file within the storage,
|
"""Find the best matching version of a file within the storage,
|
||||||
or `None` if no match can be found. If both the creation and content hash
|
or `None` if no match can be found. If both the creation and content hash
|
||||||
are specified, only exact matches will be returned. Otherwise, the most
|
are specified, only exact matches will be returned. Otherwise, the most
|
||||||
recent matching file is preferred.
|
recent matching file is preferred.
|
||||||
"""
|
"""
|
||||||
name = self.encode_name(str(path))
|
name = self.encode_name(str(path))
|
||||||
|
urls = []
|
||||||
if command_hash is not None and content_hash is not None:
|
if command_hash is not None and content_hash is not None:
|
||||||
url = self.make_url(path, command_hash, content_hash)
|
url = self.url / name / command_hash / content_hash
|
||||||
urls = [url] if url.exists() else []
|
urls = [url] if url.exists() else []
|
||||||
elif command_hash is not None:
|
elif command_hash is not None:
|
||||||
urls = list((self.url / name / command_hash).iterdir())
|
if (self.url / name / command_hash).exists():
|
||||||
|
urls = list((self.url / name / command_hash).iterdir())
|
||||||
else:
|
else:
|
||||||
urls = list((self.url / name).iterdir())
|
if (self.url / name).exists():
|
||||||
if content_hash is not None:
|
for sub_dir in (self.url / name).iterdir():
|
||||||
urls = [url for url in urls if url.parts[-1] == content_hash]
|
urls.extend(sub_dir.iterdir())
|
||||||
|
if content_hash is not None:
|
||||||
|
urls = [url for url in urls if url.parts[-1] == content_hash]
|
||||||
|
if len(urls) >= 2:
|
||||||
|
try:
|
||||||
|
urls.sort(key=lambda x: x.stat().last_modified) # type: ignore
|
||||||
|
except Exception:
|
||||||
|
msg.warn(
|
||||||
|
"Unable to sort remote files by last modified. The file(s) "
|
||||||
|
"pulled from the cache may not be the most recent."
|
||||||
|
)
|
||||||
return urls[-1] if urls else None
|
return urls[-1] if urls else None
|
||||||
|
|
||||||
def make_url(self, path: Path, command_hash: str, content_hash: str) -> "Pathy":
|
def make_url(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
|
||||||
"""Construct a URL from a subpath, a creation hash and a content hash."""
|
"""Construct a URL from a subpath, a creation hash and a content hash."""
|
||||||
return self.url / self.encode_name(str(path)) / command_hash / content_hash
|
return self.url / self.encode_name(str(path)) / command_hash / content_hash
|
||||||
|
|
||||||
|
|
|
@ -101,8 +101,8 @@ def project_run(
|
||||||
if not (project_dir / dep).exists():
|
if not (project_dir / dep).exists():
|
||||||
err = f"Missing dependency specified by command '{subcommand}': {dep}"
|
err = f"Missing dependency specified by command '{subcommand}': {dep}"
|
||||||
err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
|
err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
|
||||||
err_kwargs = {"exits": 1} if not dry else {}
|
err_exits = 1 if not dry else None
|
||||||
msg.fail(err, err_help, **err_kwargs)
|
msg.fail(err, err_help, exits=err_exits)
|
||||||
check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
|
check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
|
||||||
with working_dir(project_dir) as current_dir:
|
with working_dir(project_dir) as current_dir:
|
||||||
msg.divider(subcommand)
|
msg.divider(subcommand)
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
{# This is a template for training configs used for the quickstart widget in
|
{# This is a template for training configs used for the quickstart widget in
|
||||||
the docs and the init config command. It encodes various best practices and
|
the docs and the init config command. It encodes various best practices and
|
||||||
can help generate the best possible configuration, given a user's requirements. #}
|
can help generate the best possible configuration, given a user's requirements. #}
|
||||||
{%- set use_transformer = hardware != "cpu" -%}
|
{%- set use_transformer = hardware != "cpu" and transformer_data -%}
|
||||||
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
|
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
|
||||||
{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%}
|
{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%}
|
||||||
[paths]
|
[paths]
|
||||||
|
|
|
@ -345,6 +345,11 @@ class Errors(metaclass=ErrorsWithCodes):
|
||||||
"clear the existing vectors and resize the table.")
|
"clear the existing vectors and resize the table.")
|
||||||
E074 = ("Error interpreting compiled match pattern: patterns are expected "
|
E074 = ("Error interpreting compiled match pattern: patterns are expected "
|
||||||
"to end with the attribute {attr}. Got: {bad_attr}.")
|
"to end with the attribute {attr}. Got: {bad_attr}.")
|
||||||
|
E079 = ("Error computing states in beam: number of predicted beams "
|
||||||
|
"({pbeams}) does not equal number of gold beams ({gbeams}).")
|
||||||
|
E080 = ("Duplicate state found in beam: {key}.")
|
||||||
|
E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
|
||||||
|
"does not equal number of losses ({losses}).")
|
||||||
E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
|
E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
|
||||||
"projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
|
"projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
|
||||||
"match.")
|
"match.")
|
||||||
|
|
|
@ -43,8 +43,7 @@ from .lookups import load_lookups
|
||||||
from .compat import Literal
|
from .compat import Literal
|
||||||
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
PipeCallable = Callable[[Doc], Doc]
|
||||||
from .pipeline import Pipe # noqa: F401
|
|
||||||
|
|
||||||
|
|
||||||
# This is the base config will all settings (training etc.)
|
# This is the base config will all settings (training etc.)
|
||||||
|
@ -181,7 +180,7 @@ class Language:
|
||||||
self.vocab: Vocab = vocab
|
self.vocab: Vocab = vocab
|
||||||
if self.lang is None:
|
if self.lang is None:
|
||||||
self.lang = self.vocab.lang
|
self.lang = self.vocab.lang
|
||||||
self._components: List[Tuple[str, "Pipe"]] = []
|
self._components: List[Tuple[str, PipeCallable]] = []
|
||||||
self._disabled: Set[str] = set()
|
self._disabled: Set[str] = set()
|
||||||
self.max_length = max_length
|
self.max_length = max_length
|
||||||
# Create the default tokenizer from the default config
|
# Create the default tokenizer from the default config
|
||||||
|
@ -303,7 +302,7 @@ class Language:
|
||||||
return SimpleFrozenList(names)
|
return SimpleFrozenList(names)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def components(self) -> List[Tuple[str, "Pipe"]]:
|
def components(self) -> List[Tuple[str, PipeCallable]]:
|
||||||
"""Get all (name, component) tuples in the pipeline, including the
|
"""Get all (name, component) tuples in the pipeline, including the
|
||||||
currently disabled components.
|
currently disabled components.
|
||||||
"""
|
"""
|
||||||
|
@ -322,12 +321,12 @@ class Language:
|
||||||
return SimpleFrozenList(names, error=Errors.E926.format(attr="component_names"))
|
return SimpleFrozenList(names, error=Errors.E926.format(attr="component_names"))
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def pipeline(self) -> List[Tuple[str, "Pipe"]]:
|
def pipeline(self) -> List[Tuple[str, PipeCallable]]:
|
||||||
"""The processing pipeline consisting of (name, component) tuples. The
|
"""The processing pipeline consisting of (name, component) tuples. The
|
||||||
components are called on the Doc in order as it passes through the
|
components are called on the Doc in order as it passes through the
|
||||||
pipeline.
|
pipeline.
|
||||||
|
|
||||||
RETURNS (List[Tuple[str, Pipe]]): The pipeline.
|
RETURNS (List[Tuple[str, Callable[[Doc], Doc]]]): The pipeline.
|
||||||
"""
|
"""
|
||||||
pipes = [(n, p) for n, p in self._components if n not in self._disabled]
|
pipes = [(n, p) for n, p in self._components if n not in self._disabled]
|
||||||
return SimpleFrozenList(pipes, error=Errors.E926.format(attr="pipeline"))
|
return SimpleFrozenList(pipes, error=Errors.E926.format(attr="pipeline"))
|
||||||
|
@ -527,7 +526,7 @@ class Language:
|
||||||
assigns: Iterable[str] = SimpleFrozenList(),
|
assigns: Iterable[str] = SimpleFrozenList(),
|
||||||
requires: Iterable[str] = SimpleFrozenList(),
|
requires: Iterable[str] = SimpleFrozenList(),
|
||||||
retokenizes: bool = False,
|
retokenizes: bool = False,
|
||||||
func: Optional["Pipe"] = None,
|
func: Optional[PipeCallable] = None,
|
||||||
) -> Callable[..., Any]:
|
) -> Callable[..., Any]:
|
||||||
"""Register a new pipeline component. Can be used for stateless function
|
"""Register a new pipeline component. Can be used for stateless function
|
||||||
components that don't require a separate factory. Can be used as a
|
components that don't require a separate factory. Can be used as a
|
||||||
|
@ -542,7 +541,7 @@ class Language:
|
||||||
e.g. "token.ent_id". Used for pipeline analysis.
|
e.g. "token.ent_id". Used for pipeline analysis.
|
||||||
retokenizes (bool): Whether the component changes the tokenization.
|
retokenizes (bool): Whether the component changes the tokenization.
|
||||||
Used for pipeline analysis.
|
Used for pipeline analysis.
|
||||||
func (Optional[Callable]): Factory function if not used as a decorator.
|
func (Optional[Callable[[Doc], Doc]): Factory function if not used as a decorator.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#component
|
DOCS: https://spacy.io/api/language#component
|
||||||
"""
|
"""
|
||||||
|
@ -553,11 +552,11 @@ class Language:
|
||||||
raise ValueError(Errors.E853.format(name=name))
|
raise ValueError(Errors.E853.format(name=name))
|
||||||
component_name = name if name is not None else util.get_object_name(func)
|
component_name = name if name is not None else util.get_object_name(func)
|
||||||
|
|
||||||
def add_component(component_func: "Pipe") -> Callable:
|
def add_component(component_func: PipeCallable) -> Callable:
|
||||||
if isinstance(func, type): # function is a class
|
if isinstance(func, type): # function is a class
|
||||||
raise ValueError(Errors.E965.format(name=component_name))
|
raise ValueError(Errors.E965.format(name=component_name))
|
||||||
|
|
||||||
def factory_func(nlp, name: str) -> "Pipe":
|
def factory_func(nlp, name: str) -> PipeCallable:
|
||||||
return component_func
|
return component_func
|
||||||
|
|
||||||
internal_name = cls.get_factory_name(name)
|
internal_name = cls.get_factory_name(name)
|
||||||
|
@ -607,7 +606,7 @@ class Language:
|
||||||
print_pipe_analysis(analysis, keys=keys)
|
print_pipe_analysis(analysis, keys=keys)
|
||||||
return analysis
|
return analysis
|
||||||
|
|
||||||
def get_pipe(self, name: str) -> "Pipe":
|
def get_pipe(self, name: str) -> PipeCallable:
|
||||||
"""Get a pipeline component for a given component name.
|
"""Get a pipeline component for a given component name.
|
||||||
|
|
||||||
name (str): Name of pipeline component to get.
|
name (str): Name of pipeline component to get.
|
||||||
|
@ -628,7 +627,7 @@ class Language:
|
||||||
config: Dict[str, Any] = SimpleFrozenDict(),
|
config: Dict[str, Any] = SimpleFrozenDict(),
|
||||||
raw_config: Optional[Config] = None,
|
raw_config: Optional[Config] = None,
|
||||||
validate: bool = True,
|
validate: bool = True,
|
||||||
) -> "Pipe":
|
) -> PipeCallable:
|
||||||
"""Create a pipeline component. Mostly used internally. To create and
|
"""Create a pipeline component. Mostly used internally. To create and
|
||||||
add a component to the pipeline, you can use nlp.add_pipe.
|
add a component to the pipeline, you can use nlp.add_pipe.
|
||||||
|
|
||||||
|
@ -640,7 +639,7 @@ class Language:
|
||||||
raw_config (Optional[Config]): Internals: the non-interpolated config.
|
raw_config (Optional[Config]): Internals: the non-interpolated config.
|
||||||
validate (bool): Whether to validate the component config against the
|
validate (bool): Whether to validate the component config against the
|
||||||
arguments and types expected by the factory.
|
arguments and types expected by the factory.
|
||||||
RETURNS (Pipe): The pipeline component.
|
RETURNS (Callable[[Doc], Doc]): The pipeline component.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#create_pipe
|
DOCS: https://spacy.io/api/language#create_pipe
|
||||||
"""
|
"""
|
||||||
|
@ -695,13 +694,13 @@ class Language:
|
||||||
|
|
||||||
def create_pipe_from_source(
|
def create_pipe_from_source(
|
||||||
self, source_name: str, source: "Language", *, name: str
|
self, source_name: str, source: "Language", *, name: str
|
||||||
) -> Tuple["Pipe", str]:
|
) -> Tuple[PipeCallable, str]:
|
||||||
"""Create a pipeline component by copying it from an existing model.
|
"""Create a pipeline component by copying it from an existing model.
|
||||||
|
|
||||||
source_name (str): Name of the component in the source pipeline.
|
source_name (str): Name of the component in the source pipeline.
|
||||||
source (Language): The source nlp object to copy from.
|
source (Language): The source nlp object to copy from.
|
||||||
name (str): Optional alternative name to use in current pipeline.
|
name (str): Optional alternative name to use in current pipeline.
|
||||||
RETURNS (Tuple[Callable, str]): The component and its factory name.
|
RETURNS (Tuple[Callable[[Doc], Doc], str]): The component and its factory name.
|
||||||
"""
|
"""
|
||||||
# Check source type
|
# Check source type
|
||||||
if not isinstance(source, Language):
|
if not isinstance(source, Language):
|
||||||
|
@ -740,7 +739,7 @@ class Language:
|
||||||
config: Dict[str, Any] = SimpleFrozenDict(),
|
config: Dict[str, Any] = SimpleFrozenDict(),
|
||||||
raw_config: Optional[Config] = None,
|
raw_config: Optional[Config] = None,
|
||||||
validate: bool = True,
|
validate: bool = True,
|
||||||
) -> "Pipe":
|
) -> PipeCallable:
|
||||||
"""Add a component to the processing pipeline. Valid components are
|
"""Add a component to the processing pipeline. Valid components are
|
||||||
callables that take a `Doc` object, modify it and return it. Only one
|
callables that take a `Doc` object, modify it and return it. Only one
|
||||||
of before/after/first/last can be set. Default behaviour is "last".
|
of before/after/first/last can be set. Default behaviour is "last".
|
||||||
|
@ -763,7 +762,7 @@ class Language:
|
||||||
raw_config (Optional[Config]): Internals: the non-interpolated config.
|
raw_config (Optional[Config]): Internals: the non-interpolated config.
|
||||||
validate (bool): Whether to validate the component config against the
|
validate (bool): Whether to validate the component config against the
|
||||||
arguments and types expected by the factory.
|
arguments and types expected by the factory.
|
||||||
RETURNS (Pipe): The pipeline component.
|
RETURNS (Callable[[Doc], Doc]): The pipeline component.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#add_pipe
|
DOCS: https://spacy.io/api/language#add_pipe
|
||||||
"""
|
"""
|
||||||
|
@ -869,7 +868,7 @@ class Language:
|
||||||
*,
|
*,
|
||||||
config: Dict[str, Any] = SimpleFrozenDict(),
|
config: Dict[str, Any] = SimpleFrozenDict(),
|
||||||
validate: bool = True,
|
validate: bool = True,
|
||||||
) -> "Pipe":
|
) -> PipeCallable:
|
||||||
"""Replace a component in the pipeline.
|
"""Replace a component in the pipeline.
|
||||||
|
|
||||||
name (str): Name of the component to replace.
|
name (str): Name of the component to replace.
|
||||||
|
@ -878,7 +877,7 @@ class Language:
|
||||||
component. Will be merged with default config, if available.
|
component. Will be merged with default config, if available.
|
||||||
validate (bool): Whether to validate the component config against the
|
validate (bool): Whether to validate the component config against the
|
||||||
arguments and types expected by the factory.
|
arguments and types expected by the factory.
|
||||||
RETURNS (Pipe): The new pipeline component.
|
RETURNS (Callable[[Doc], Doc]): The new pipeline component.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#replace_pipe
|
DOCS: https://spacy.io/api/language#replace_pipe
|
||||||
"""
|
"""
|
||||||
|
@ -930,11 +929,11 @@ class Language:
|
||||||
init_cfg = self._config["initialize"]["components"].pop(old_name)
|
init_cfg = self._config["initialize"]["components"].pop(old_name)
|
||||||
self._config["initialize"]["components"][new_name] = init_cfg
|
self._config["initialize"]["components"][new_name] = init_cfg
|
||||||
|
|
||||||
def remove_pipe(self, name: str) -> Tuple[str, "Pipe"]:
|
def remove_pipe(self, name: str) -> Tuple[str, PipeCallable]:
|
||||||
"""Remove a component from the pipeline.
|
"""Remove a component from the pipeline.
|
||||||
|
|
||||||
name (str): Name of the component to remove.
|
name (str): Name of the component to remove.
|
||||||
RETURNS (tuple): A `(name, component)` tuple of the removed component.
|
RETURNS (Tuple[str, Callable[[Doc], Doc]]): A `(name, component)` tuple of the removed component.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#remove_pipe
|
DOCS: https://spacy.io/api/language#remove_pipe
|
||||||
"""
|
"""
|
||||||
|
@ -1349,15 +1348,15 @@ class Language:
|
||||||
|
|
||||||
def set_error_handler(
|
def set_error_handler(
|
||||||
self,
|
self,
|
||||||
error_handler: Callable[[str, "Pipe", List[Doc], Exception], NoReturn],
|
error_handler: Callable[[str, PipeCallable, List[Doc], Exception], NoReturn],
|
||||||
):
|
):
|
||||||
"""Set an error handler object for all the components in the pipeline that implement
|
"""Set an error handler object for all the components in the pipeline
|
||||||
a set_error_handler function.
|
that implement a set_error_handler function.
|
||||||
|
|
||||||
error_handler (Callable[[str, Pipe, List[Doc], Exception], NoReturn]):
|
error_handler (Callable[[str, Callable[[Doc], Doc], List[Doc], Exception], NoReturn]):
|
||||||
Function that deals with a failing batch of documents. This callable function should take in
|
Function that deals with a failing batch of documents. This callable
|
||||||
the component's name, the component itself, the offending batch of documents, and the exception
|
function should take in the component's name, the component itself,
|
||||||
that was thrown.
|
the offending batch of documents, and the exception that was thrown.
|
||||||
DOCS: https://spacy.io/api/language#set_error_handler
|
DOCS: https://spacy.io/api/language#set_error_handler
|
||||||
"""
|
"""
|
||||||
self.default_error_handler = error_handler
|
self.default_error_handler = error_handler
|
||||||
|
|
|
@ -328,9 +328,9 @@ class EditTreeLemmatizer(TrainablePipe):
|
||||||
|
|
||||||
tree = dict(tree)
|
tree = dict(tree)
|
||||||
if "orig" in tree:
|
if "orig" in tree:
|
||||||
tree["orig"] = self.vocab.strings[tree["orig"]]
|
tree["orig"] = self.vocab.strings.add(tree["orig"])
|
||||||
if "orig" in tree:
|
if "orig" in tree:
|
||||||
tree["subst"] = self.vocab.strings[tree["subst"]]
|
tree["subst"] = self.vocab.strings.add(tree["subst"])
|
||||||
|
|
||||||
trees.append(tree)
|
trees.append(tree)
|
||||||
|
|
||||||
|
|
|
@ -272,7 +272,10 @@ class SpanCategorizer(TrainablePipe):
|
||||||
DOCS: https://spacy.io/api/spancategorizer#predict
|
DOCS: https://spacy.io/api/spancategorizer#predict
|
||||||
"""
|
"""
|
||||||
indices = self.suggester(docs, ops=self.model.ops)
|
indices = self.suggester(docs, ops=self.model.ops)
|
||||||
scores = self.model.predict((docs, indices)) # type: ignore
|
if indices.lengths.sum() == 0:
|
||||||
|
scores = self.model.ops.alloc2f(0, 0)
|
||||||
|
else:
|
||||||
|
scores = self.model.predict((docs, indices)) # type: ignore
|
||||||
return indices, scores
|
return indices, scores
|
||||||
|
|
||||||
def set_candidates(
|
def set_candidates(
|
||||||
|
|
|
@ -87,7 +87,6 @@ subword_features = true
|
||||||
"cats_macro_f": None,
|
"cats_macro_f": None,
|
||||||
"cats_macro_auc": None,
|
"cats_macro_auc": None,
|
||||||
"cats_f_per_type": None,
|
"cats_f_per_type": None,
|
||||||
"cats_macro_auc_per_type": None,
|
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
def make_textcat(
|
def make_textcat(
|
||||||
|
|
|
@ -87,7 +87,6 @@ subword_features = true
|
||||||
"cats_macro_f": None,
|
"cats_macro_f": None,
|
||||||
"cats_macro_auc": None,
|
"cats_macro_auc": None,
|
||||||
"cats_f_per_type": None,
|
"cats_f_per_type": None,
|
||||||
"cats_macro_auc_per_type": None,
|
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
def make_multilabel_textcat(
|
def make_multilabel_textcat(
|
||||||
|
|
|
@ -60,10 +60,45 @@ def test_initialize_from_labels():
|
||||||
nlp2 = Language()
|
nlp2 = Language()
|
||||||
lemmatizer2 = nlp2.add_pipe("trainable_lemmatizer")
|
lemmatizer2 = nlp2.add_pipe("trainable_lemmatizer")
|
||||||
lemmatizer2.initialize(
|
lemmatizer2.initialize(
|
||||||
get_examples=lambda: train_examples,
|
# We want to check that the strings in replacement nodes are
|
||||||
|
# added to the string store. Avoid that they get added through
|
||||||
|
# the examples.
|
||||||
|
get_examples=lambda: train_examples[:1],
|
||||||
labels=lemmatizer.label_data,
|
labels=lemmatizer.label_data,
|
||||||
)
|
)
|
||||||
assert lemmatizer2.tree2label == {1: 0, 3: 1, 4: 2, 6: 3}
|
assert lemmatizer2.tree2label == {1: 0, 3: 1, 4: 2, 6: 3}
|
||||||
|
assert lemmatizer2.label_data == {
|
||||||
|
"trees": [
|
||||||
|
{"orig": "S", "subst": "s"},
|
||||||
|
{
|
||||||
|
"prefix_len": 1,
|
||||||
|
"suffix_len": 0,
|
||||||
|
"prefix_tree": 0,
|
||||||
|
"suffix_tree": 4294967295,
|
||||||
|
},
|
||||||
|
{"orig": "s", "subst": ""},
|
||||||
|
{
|
||||||
|
"prefix_len": 0,
|
||||||
|
"suffix_len": 1,
|
||||||
|
"prefix_tree": 4294967295,
|
||||||
|
"suffix_tree": 2,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"prefix_len": 0,
|
||||||
|
"suffix_len": 0,
|
||||||
|
"prefix_tree": 4294967295,
|
||||||
|
"suffix_tree": 4294967295,
|
||||||
|
},
|
||||||
|
{"orig": "E", "subst": "e"},
|
||||||
|
{
|
||||||
|
"prefix_len": 1,
|
||||||
|
"suffix_len": 0,
|
||||||
|
"prefix_tree": 5,
|
||||||
|
"suffix_tree": 4294967295,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
"labels": (1, 3, 4, 6),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def test_no_data():
|
def test_no_data():
|
||||||
|
|
|
@ -372,24 +372,39 @@ def test_overfitting_IO_overlapping():
|
||||||
|
|
||||||
|
|
||||||
def test_zero_suggestions():
|
def test_zero_suggestions():
|
||||||
# Test with a suggester that returns 0 suggestions
|
# Test with a suggester that can return 0 suggestions
|
||||||
|
|
||||||
@registry.misc("test_zero_suggester")
|
@registry.misc("test_mixed_zero_suggester")
|
||||||
def make_zero_suggester():
|
def make_mixed_zero_suggester():
|
||||||
def zero_suggester(docs, *, ops=None):
|
def mixed_zero_suggester(docs, *, ops=None):
|
||||||
if ops is None:
|
if ops is None:
|
||||||
ops = get_current_ops()
|
ops = get_current_ops()
|
||||||
return Ragged(
|
spans = []
|
||||||
ops.xp.zeros((0, 0), dtype="i"), ops.xp.zeros((len(docs),), dtype="i")
|
lengths = []
|
||||||
)
|
for doc in docs:
|
||||||
|
if len(doc) > 0 and len(doc) % 2 == 0:
|
||||||
|
spans.append((0, 1))
|
||||||
|
lengths.append(1)
|
||||||
|
else:
|
||||||
|
lengths.append(0)
|
||||||
|
spans = ops.asarray2i(spans)
|
||||||
|
lengths_array = ops.asarray1i(lengths)
|
||||||
|
if len(spans) > 0:
|
||||||
|
output = Ragged(ops.xp.vstack(spans), lengths_array)
|
||||||
|
else:
|
||||||
|
output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array)
|
||||||
|
return output
|
||||||
|
|
||||||
return zero_suggester
|
return mixed_zero_suggester
|
||||||
|
|
||||||
fix_random_seed(0)
|
fix_random_seed(0)
|
||||||
nlp = English()
|
nlp = English()
|
||||||
spancat = nlp.add_pipe(
|
spancat = nlp.add_pipe(
|
||||||
"spancat",
|
"spancat",
|
||||||
config={"suggester": {"@misc": "test_zero_suggester"}, "spans_key": SPAN_KEY},
|
config={
|
||||||
|
"suggester": {"@misc": "test_mixed_zero_suggester"},
|
||||||
|
"spans_key": SPAN_KEY,
|
||||||
|
},
|
||||||
)
|
)
|
||||||
train_examples = make_examples(nlp)
|
train_examples = make_examples(nlp)
|
||||||
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||||
|
@ -397,6 +412,16 @@ def test_zero_suggestions():
|
||||||
assert set(spancat.labels) == {"LOC", "PERSON"}
|
assert set(spancat.labels) == {"LOC", "PERSON"}
|
||||||
|
|
||||||
nlp.update(train_examples, sgd=optimizer)
|
nlp.update(train_examples, sgd=optimizer)
|
||||||
|
# empty doc
|
||||||
|
nlp("")
|
||||||
|
# single doc with zero suggestions
|
||||||
|
nlp("one")
|
||||||
|
# single doc with one suggestion
|
||||||
|
nlp("two two")
|
||||||
|
# batch with mixed zero/one suggestions
|
||||||
|
list(nlp.pipe(["one", "two two", "three three three", "", "four four four four"]))
|
||||||
|
# batch with no suggestions
|
||||||
|
list(nlp.pipe(["", "one", "three three three"]))
|
||||||
|
|
||||||
|
|
||||||
def test_set_candidates():
|
def test_set_candidates():
|
||||||
|
|
|
@ -838,8 +838,8 @@ def test_textcat_loss(multi_label: bool, expected_loss: float):
|
||||||
textcat = nlp.add_pipe("textcat_multilabel")
|
textcat = nlp.add_pipe("textcat_multilabel")
|
||||||
else:
|
else:
|
||||||
textcat = nlp.add_pipe("textcat")
|
textcat = nlp.add_pipe("textcat")
|
||||||
textcat.initialize(lambda: train_examples)
|
|
||||||
assert isinstance(textcat, TextCategorizer)
|
assert isinstance(textcat, TextCategorizer)
|
||||||
|
textcat.initialize(lambda: train_examples)
|
||||||
scores = textcat.model.ops.asarray(
|
scores = textcat.model.ops.asarray(
|
||||||
[[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 1.0, 1.0]], dtype="f" # type: ignore
|
[[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 1.0, 1.0]], dtype="f" # type: ignore
|
||||||
)
|
)
|
||||||
|
|
|
@ -3,6 +3,7 @@ import math
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
from typing import Tuple, List, Dict, Any
|
from typing import Tuple, List, Dict, Any
|
||||||
import pkg_resources
|
import pkg_resources
|
||||||
|
import time
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
import pytest
|
import pytest
|
||||||
|
@ -28,6 +29,7 @@ from spacy.cli.download import get_compatibility, get_version
|
||||||
from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
|
from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
|
||||||
from spacy.cli.package import get_third_party_dependencies
|
from spacy.cli.package import get_third_party_dependencies
|
||||||
from spacy.cli.package import _is_permitted_package_name
|
from spacy.cli.package import _is_permitted_package_name
|
||||||
|
from spacy.cli.project.remote_storage import RemoteStorage
|
||||||
from spacy.cli.project.run import _check_requirements
|
from spacy.cli.project.run import _check_requirements
|
||||||
from spacy.cli.validate import get_model_pkgs
|
from spacy.cli.validate import get_model_pkgs
|
||||||
from spacy.cli.find_threshold import find_threshold
|
from spacy.cli.find_threshold import find_threshold
|
||||||
|
@ -121,6 +123,25 @@ def test_issue7055():
|
||||||
assert "model" in filled_cfg["components"]["ner"]
|
assert "model" in filled_cfg["components"]["ner"]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.issue(11235)
|
||||||
|
def test_issue11235():
|
||||||
|
"""
|
||||||
|
Test that the cli handles interpolation in the directory names correctly when loading project config.
|
||||||
|
"""
|
||||||
|
lang_var = "en"
|
||||||
|
variables = {"lang": lang_var}
|
||||||
|
commands = [{"name": "x", "script": ["hello ${vars.lang}"]}]
|
||||||
|
directories = ["cfg", "${vars.lang}_model"]
|
||||||
|
project = {"commands": commands, "vars": variables, "directories": directories}
|
||||||
|
with make_tempdir() as d:
|
||||||
|
srsly.write_yaml(d / "project.yml", project)
|
||||||
|
cfg = load_project_config(d)
|
||||||
|
# Check that the directories are interpolated and created correctly
|
||||||
|
assert os.path.exists(d / "cfg")
|
||||||
|
assert os.path.exists(d / f"{lang_var}_model")
|
||||||
|
assert cfg["commands"][0]["script"][0] == f"hello {lang_var}"
|
||||||
|
|
||||||
|
|
||||||
def test_cli_info():
|
def test_cli_info():
|
||||||
nlp = Dutch()
|
nlp = Dutch()
|
||||||
nlp.add_pipe("textcat")
|
nlp.add_pipe("textcat")
|
||||||
|
@ -594,6 +615,7 @@ def test_string_to_list_intify(value):
|
||||||
assert string_to_list(value, intify=True) == [1, 2, 3]
|
assert string_to_list(value, intify=True) == [1, 2, 3]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip(reason="Temporarily skip for dev version")
|
||||||
def test_download_compatibility():
|
def test_download_compatibility():
|
||||||
spec = SpecifierSet("==" + about.__version__)
|
spec = SpecifierSet("==" + about.__version__)
|
||||||
spec.prereleases = False
|
spec.prereleases = False
|
||||||
|
@ -604,6 +626,7 @@ def test_download_compatibility():
|
||||||
assert get_minor_version(about.__version__) == get_minor_version(version)
|
assert get_minor_version(about.__version__) == get_minor_version(version)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip(reason="Temporarily skip for dev version")
|
||||||
def test_validate_compatibility_table():
|
def test_validate_compatibility_table():
|
||||||
spec = SpecifierSet("==" + about.__version__)
|
spec = SpecifierSet("==" + about.__version__)
|
||||||
spec.prereleases = False
|
spec.prereleases = False
|
||||||
|
@ -862,6 +885,60 @@ def test_span_length_freq_dist_output_must_be_correct():
|
||||||
assert list(span_freqs.keys()) == [3, 1, 4, 5, 2]
|
assert list(span_freqs.keys()) == [3, 1, 4, 5, 2]
|
||||||
|
|
||||||
|
|
||||||
|
def test_local_remote_storage():
|
||||||
|
with make_tempdir() as d:
|
||||||
|
filename = "a.txt"
|
||||||
|
|
||||||
|
content_hashes = ("aaaa", "cccc", "bbbb")
|
||||||
|
for i, content_hash in enumerate(content_hashes):
|
||||||
|
# make sure that each subsequent file has a later timestamp
|
||||||
|
if i > 0:
|
||||||
|
time.sleep(1)
|
||||||
|
content = f"{content_hash} content"
|
||||||
|
loc_file = d / "root" / filename
|
||||||
|
if not loc_file.parent.exists():
|
||||||
|
loc_file.parent.mkdir(parents=True)
|
||||||
|
with loc_file.open(mode="w") as file_:
|
||||||
|
file_.write(content)
|
||||||
|
|
||||||
|
# push first version to remote storage
|
||||||
|
remote = RemoteStorage(d / "root", str(d / "remote"))
|
||||||
|
remote.push(filename, "aaaa", content_hash)
|
||||||
|
|
||||||
|
# retrieve with full hashes
|
||||||
|
loc_file.unlink()
|
||||||
|
remote.pull(filename, command_hash="aaaa", content_hash=content_hash)
|
||||||
|
with loc_file.open(mode="r") as file_:
|
||||||
|
assert file_.read() == content
|
||||||
|
|
||||||
|
# retrieve with command hash
|
||||||
|
loc_file.unlink()
|
||||||
|
remote.pull(filename, command_hash="aaaa")
|
||||||
|
with loc_file.open(mode="r") as file_:
|
||||||
|
assert file_.read() == content
|
||||||
|
|
||||||
|
# retrieve with content hash
|
||||||
|
loc_file.unlink()
|
||||||
|
remote.pull(filename, content_hash=content_hash)
|
||||||
|
with loc_file.open(mode="r") as file_:
|
||||||
|
assert file_.read() == content
|
||||||
|
|
||||||
|
# retrieve with no hashes
|
||||||
|
loc_file.unlink()
|
||||||
|
remote.pull(filename)
|
||||||
|
with loc_file.open(mode="r") as file_:
|
||||||
|
assert file_.read() == content
|
||||||
|
|
||||||
|
|
||||||
|
def test_local_remote_storage_pull_missing():
|
||||||
|
# pulling from a non-existent remote pulls nothing gracefully
|
||||||
|
with make_tempdir() as d:
|
||||||
|
filename = "a.txt"
|
||||||
|
remote = RemoteStorage(d / "root", str(d / "remote"))
|
||||||
|
assert remote.pull(filename, command_hash="aaaa") is None
|
||||||
|
assert remote.pull(filename) is None
|
||||||
|
|
||||||
|
|
||||||
def test_cli_find_threshold(capsys):
|
def test_cli_find_threshold(capsys):
|
||||||
thresholds = numpy.linspace(0, 1, 10)
|
thresholds = numpy.linspace(0, 1, 10)
|
||||||
|
|
||||||
|
|
|
@ -51,8 +51,7 @@ from . import about
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
# This lets us add type hints for mypy etc. without causing circular imports
|
# This lets us add type hints for mypy etc. without causing circular imports
|
||||||
from .language import Language # noqa: F401
|
from .language import Language, PipeCallable # noqa: F401
|
||||||
from .pipeline import Pipe # noqa: F401
|
|
||||||
from .tokens import Doc, Span # noqa: F401
|
from .tokens import Doc, Span # noqa: F401
|
||||||
from .vocab import Vocab # noqa: F401
|
from .vocab import Vocab # noqa: F401
|
||||||
|
|
||||||
|
@ -1642,9 +1641,11 @@ def check_bool_env_var(env_var: str) -> bool:
|
||||||
|
|
||||||
def _pipe(
|
def _pipe(
|
||||||
docs: Iterable["Doc"],
|
docs: Iterable["Doc"],
|
||||||
proc: "Pipe",
|
proc: "PipeCallable",
|
||||||
name: str,
|
name: str,
|
||||||
default_error_handler: Callable[[str, "Pipe", List["Doc"], Exception], NoReturn],
|
default_error_handler: Callable[
|
||||||
|
[str, "PipeCallable", List["Doc"], Exception], NoReturn
|
||||||
|
],
|
||||||
kwargs: Mapping[str, Any],
|
kwargs: Mapping[str, Any],
|
||||||
) -> Iterator["Doc"]:
|
) -> Iterator["Doc"]:
|
||||||
if hasattr(proc, "pipe"):
|
if hasattr(proc, "pipe"):
|
||||||
|
|
|
@ -1,531 +1,11 @@
|
||||||
<Comment>
|
|
||||||
|
|
||||||
# spacy.io website and docs
|
# spacy.io website and docs
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
_This page contains the documentation and styleguide for the spaCy website. Its
|
The styleguide for the spaCy website is available at
|
||||||
rendered version is available at https://spacy.io/styleguide._
|
[spacy.io/styleguide](https://spacy.io/styleguide).
|
||||||
|
|
||||||
---
|
## Setup and installation
|
||||||
|
|
||||||
</Comment>
|
|
||||||
|
|
||||||
The [spacy.io](https://spacy.io) website is implemented using
|
|
||||||
[Gatsby](https://www.gatsbyjs.org) with
|
|
||||||
[Remark](https://github.com/remarkjs/remark) and [MDX](https://mdxjs.com/). This
|
|
||||||
allows authoring content in **straightforward Markdown** without the usual
|
|
||||||
limitations. Standard elements can be overwritten with powerful
|
|
||||||
[React](http://reactjs.org/) components and wherever Markdown syntax isn't
|
|
||||||
enough, JSX components can be used.
|
|
||||||
|
|
||||||
> #### Contributing to the site
|
|
||||||
>
|
|
||||||
> The docs can always use another example or more detail, and they should always
|
|
||||||
> be up to date and not misleading. We always appreciate a
|
|
||||||
> [pull request](https://github.com/explosion/spaCy/pulls). To quickly find the
|
|
||||||
> correct file to edit, simply click on the "Suggest edits" button at the bottom
|
|
||||||
> of a page.
|
|
||||||
>
|
|
||||||
> For more details on editing the site locally, see the installation
|
|
||||||
> instructions and markdown reference below.
|
|
||||||
|
|
||||||
## Logo {#logo source="website/src/images/logo.svg"}
|
|
||||||
|
|
||||||
import { Logos } from 'widgets/styleguide'
|
|
||||||
|
|
||||||
If you would like to use the spaCy logo on your site, please get in touch and
|
|
||||||
ask us first. However, if you want to show support and tell others that your
|
|
||||||
project is using spaCy, you can grab one of our
|
|
||||||
[spaCy badges](/usage/spacy-101#faq-project-with-spacy).
|
|
||||||
|
|
||||||
<Logos />
|
|
||||||
|
|
||||||
## Colors {#colors}
|
|
||||||
|
|
||||||
import { Colors, Patterns } from 'widgets/styleguide'
|
|
||||||
|
|
||||||
<Colors />
|
|
||||||
|
|
||||||
### Patterns
|
|
||||||
|
|
||||||
<Patterns />
|
|
||||||
|
|
||||||
## Typography {#typography}
|
|
||||||
|
|
||||||
import { H1, H2, H3, H4, H5, Label, InlineList, Comment } from
|
|
||||||
'components/typography'
|
|
||||||
|
|
||||||
> #### Markdown
|
|
||||||
>
|
|
||||||
> ```markdown_
|
|
||||||
> ## Headline 2
|
|
||||||
> ## Headline 2 {#some_id}
|
|
||||||
> ## Headline 2 {#some_id tag="method"}
|
|
||||||
> ```
|
|
||||||
>
|
|
||||||
> #### JSX
|
|
||||||
>
|
|
||||||
> ```jsx
|
|
||||||
> <H2>Headline 2</H2>
|
|
||||||
> <H2 id="some_id">Headline 2</H2>
|
|
||||||
> <H2 id="some_id" tag="method">Headline 2</H2>
|
|
||||||
> ```
|
|
||||||
|
|
||||||
Headlines are set in
|
|
||||||
[HK Grotesk](http://cargocollective.com/hanken/HK-Grotesk-Open-Source-Font) by
|
|
||||||
Hanken Design. All other body text uses the best-matching default
|
|
||||||
system font to provide a "native" reading experience. All code uses the
|
|
||||||
[JetBrains Mono](https://www.jetbrains.com/lp/mono/) typeface by JetBrains.
|
|
||||||
|
|
||||||
<Infobox title="Important note" variant="warning">
|
|
||||||
|
|
||||||
Level 2 headings are automatically wrapped in `<section>` elements at compile
|
|
||||||
time, using a custom
|
|
||||||
[Markdown transformer](https://github.com/explosion/spaCy/tree/master/website/plugins/remark-wrap-section.js).
|
|
||||||
This makes it easier to highlight the section that's currently in the viewport
|
|
||||||
in the sidebar menu.
|
|
||||||
|
|
||||||
</Infobox>
|
|
||||||
|
|
||||||
<div>
|
|
||||||
<H1>Headline 1</H1>
|
|
||||||
<H2>Headline 2</H2>
|
|
||||||
<H3>Headline 3</H3>
|
|
||||||
<H4>Headline 4</H4>
|
|
||||||
<H5>Headline 5</H5>
|
|
||||||
<Label>Label</Label>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
The following optional attributes can be set on the headline to modify it. For
|
|
||||||
example, to add a tag for the documented type or mark features that have been
|
|
||||||
introduced in a specific version or require statistical models to be loaded.
|
|
||||||
Tags are also available as standalone `<Tag />` components.
|
|
||||||
|
|
||||||
| Argument | Example | Result |
|
|
||||||
| -------- | -------------------------- | ----------------------------------------- |
|
|
||||||
| `tag` | `{tag="method"}` | <Tag>method</Tag> |
|
|
||||||
| `new` | `{new="3"}` | <Tag variant="new">3</Tag> |
|
|
||||||
| `model` | `{model="tagger, parser"}` | <Tag variant="model">tagger, parser</Tag> |
|
|
||||||
| `hidden` | `{hidden="true"}` | |
|
|
||||||
|
|
||||||
## Elements {#elements}
|
|
||||||
|
|
||||||
### Links {#links}
|
|
||||||
|
|
||||||
> #### Markdown
|
|
||||||
>
|
|
||||||
> ```markdown
|
|
||||||
> [I am a link](https://spacy.io)
|
|
||||||
> ```
|
|
||||||
>
|
|
||||||
> #### JSX
|
|
||||||
>
|
|
||||||
> ```jsx
|
|
||||||
> <Link to="https://spacy.io">I am a link</Link>
|
|
||||||
> ```
|
|
||||||
|
|
||||||
Special link styles are used depending on the link URL.
|
|
||||||
|
|
||||||
- [I am a regular external link](https://explosion.ai)
|
|
||||||
- [I am a link to the documentation](/api/doc)
|
|
||||||
- [I am a link to an architecture](/api/architectures#HashEmbedCNN)
|
|
||||||
- [I am a link to a model](/models/en#en_core_web_sm)
|
|
||||||
- [I am a link to GitHub](https://github.com/explosion/spaCy)
|
|
||||||
|
|
||||||
### Abbreviations {#abbr}
|
|
||||||
|
|
||||||
import { Abbr } from 'components/typography'
|
|
||||||
|
|
||||||
> #### JSX
|
|
||||||
>
|
|
||||||
> ```jsx
|
|
||||||
> <Abbr title="Explanation">Abbreviation</Abbr>
|
|
||||||
> ```
|
|
||||||
|
|
||||||
Some text with <Abbr title="Explanation here">an abbreviation</Abbr>. On small
|
|
||||||
screens, I collapse and the explanation text is displayed next to the
|
|
||||||
abbreviation.
|
|
||||||
|
|
||||||
### Tags {#tags}
|
|
||||||
|
|
||||||
import Tag from 'components/tag'
|
|
||||||
|
|
||||||
> ```jsx
|
|
||||||
> <Tag>method</Tag>
|
|
||||||
> <Tag variant="new">4</Tag>
|
|
||||||
> <Tag variant="model">tagger, parser</Tag>
|
|
||||||
> ```
|
|
||||||
|
|
||||||
Tags can be used together with headlines, or next to properties across the
|
|
||||||
documentation, and combined with tooltips to provide additional information. An
|
|
||||||
optional `variant` argument can be used for special tags. `variant="new"` makes
|
|
||||||
the tag take a version number to mark new features. Using the component,
|
|
||||||
visibility of this tag can later be toggled once the feature isn't considered
|
|
||||||
new anymore. Setting `variant="model"` takes a description of model capabilities
|
|
||||||
and can be used to mark features that require a respective model to be
|
|
||||||
installed.
|
|
||||||
|
|
||||||
<InlineList>
|
|
||||||
|
|
||||||
<Tag>method</Tag> <Tag variant="new">4</Tag> <Tag variant="model">tagger,
|
|
||||||
parser</Tag>
|
|
||||||
|
|
||||||
</InlineList>
|
|
||||||
|
|
||||||
### Buttons {#buttons}
|
|
||||||
|
|
||||||
import Button from 'components/button'
|
|
||||||
|
|
||||||
> ```jsx
|
|
||||||
> <Button to="#" variant="primary">Primary small</Button>
|
|
||||||
> <Button to="#" variant="secondary">Secondary small</Button>
|
|
||||||
> ```
|
|
||||||
|
|
||||||
Link buttons come in two variants, `primary` and `secondary` and two sizes, with
|
|
||||||
an optional `large` size modifier. Since they're mostly used as enhanced links,
|
|
||||||
the buttons are implemented as styled links instead of native button elements.
|
|
||||||
|
|
||||||
<InlineList><Button to="#" variant="primary">Primary small</Button>
|
|
||||||
<Button to="#" variant="secondary">Secondary small</Button></InlineList>
|
|
||||||
|
|
||||||
<br />
|
|
||||||
|
|
||||||
<InlineList><Button to="#" variant="primary" large>Primary large</Button>
|
|
||||||
<Button to="#" variant="secondary" large>Secondary large</Button></InlineList>
|
|
||||||
|
|
||||||
## Components
|
|
||||||
|
|
||||||
### Table {#table}
|
|
||||||
|
|
||||||
> #### Markdown
|
|
||||||
>
|
|
||||||
> ```markdown_
|
|
||||||
> | Header 1 | Header 2 |
|
|
||||||
> | -------- | -------- |
|
|
||||||
> | Column 1 | Column 2 |
|
|
||||||
> ```
|
|
||||||
>
|
|
||||||
> #### JSX
|
|
||||||
>
|
|
||||||
> ```markup
|
|
||||||
> <Table>
|
|
||||||
> <Tr><Th>Header 1</Th><Th>Header 2</Th></Tr>
|
|
||||||
> <Tr><Td>Column 1</Td><Td>Column 2</Td></Tr>
|
|
||||||
> </Table>
|
|
||||||
> ```
|
|
||||||
|
|
||||||
Tables are used to present data and API documentation. Certain keywords can be
|
|
||||||
used to mark a footer row with a distinct style, for example to visualize the
|
|
||||||
return values of a documented function.
|
|
||||||
|
|
||||||
| Header 1 | Header 2 | Header 3 | Header 4 |
|
|
||||||
| ----------- | -------- | :------: | -------: |
|
|
||||||
| Column 1 | Column 2 | Column 3 | Column 4 |
|
|
||||||
| Column 1 | Column 2 | Column 3 | Column 4 |
|
|
||||||
| Column 1 | Column 2 | Column 3 | Column 4 |
|
|
||||||
| Column 1 | Column 2 | Column 3 | Column 4 |
|
|
||||||
| **RETURNS** | Column 2 | Column 3 | Column 4 |
|
|
||||||
|
|
||||||
Tables also support optional "divider" rows that are typically used to denote
|
|
||||||
keyword-only arguments in API documentation. To turn a row into a dividing
|
|
||||||
headline, it should only include content in its first cell, and its value should
|
|
||||||
be italicized:
|
|
||||||
|
|
||||||
> #### Markdown
|
|
||||||
>
|
|
||||||
> ```markdown_
|
|
||||||
> | Header 1 | Header 2 | Header 3 |
|
|
||||||
> | -------- | -------- | -------- |
|
|
||||||
> | Column 1 | Column 2 | Column 3 |
|
|
||||||
> | _Hello_ | | |
|
|
||||||
> | Column 1 | Column 2 | Column 3 |
|
|
||||||
> ```
|
|
||||||
|
|
||||||
| Header 1 | Header 2 | Header 3 |
|
|
||||||
| -------- | -------- | -------- |
|
|
||||||
| Column 1 | Column 2 | Column 3 |
|
|
||||||
| _Hello_ | | |
|
|
||||||
| Column 1 | Column 2 | Column 3 |
|
|
||||||
|
|
||||||
### Type Annotations {#type-annotations}
|
|
||||||
|
|
||||||
> #### Markdown
|
|
||||||
>
|
|
||||||
> ```markdown_
|
|
||||||
> ~~Model[List[Doc], Floats2d]~~
|
|
||||||
> ```
|
|
||||||
>
|
|
||||||
> #### JSX
|
|
||||||
>
|
|
||||||
> ```markup
|
|
||||||
> <TypeAnnotation>Model[List[Doc], Floats2d]</TypeAnnotation>
|
|
||||||
> ```
|
|
||||||
|
|
||||||
Type annotations are special inline code blocks that are used to describe Python
|
|
||||||
types in the [type hints](https://docs.python.org/3/library/typing.html) format.
|
|
||||||
The special component will split the type, apply syntax highlighting and link
|
|
||||||
all types that specify links in `meta/type-annotations.json`. Types can link to
|
|
||||||
internal or external documentation pages. To make it easy to represent the type
|
|
||||||
annotations in Markdown, the rendering "hijacks" the `~~` tags that would
|
|
||||||
typically be converted to a `<del>` element – but in this case, text surrounded
|
|
||||||
by `~~` becomes a type annotation.
|
|
||||||
|
|
||||||
- ~~Dict[str, List[Union[Doc, Span]]]~~
|
|
||||||
- ~~Model[List[Doc], List[numpy.ndarray]]~~
|
|
||||||
|
|
||||||
Type annotations support a special visual style in tables and will render as a
|
|
||||||
separate row, under the cell text. This allows the API docs to display complex
|
|
||||||
types without taking up too much space in the cell. The type annotation should
|
|
||||||
always be the **last element** in the row.
|
|
||||||
|
|
||||||
> #### Markdown
|
|
||||||
>
|
|
||||||
> ```markdown_
|
|
||||||
> | Header 1 | Header 2 |
|
|
||||||
> | -------- | ----------------------- |
|
|
||||||
> | Column 1 | Column 2 ~~List[Doc]~~ |
|
|
||||||
> ```
|
|
||||||
|
|
||||||
| Name | Description |
|
|
||||||
| ----------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
|
||||||
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
|
||||||
| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. ~~Model[List[Doc], FullTransformerBatch]~~ |
|
|
||||||
| `set_extra_annotations` | Function that takes a batch of `Doc` objects and transformer outputs and can set additional annotations on the `Doc`. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
|
|
||||||
|
|
||||||
### List {#list}
|
|
||||||
|
|
||||||
> #### Markdown
|
|
||||||
>
|
|
||||||
> ```markdown_
|
|
||||||
> 1. One
|
|
||||||
> 2. Two
|
|
||||||
> ```
|
|
||||||
>
|
|
||||||
> #### JSX
|
|
||||||
>
|
|
||||||
> ```markup
|
|
||||||
> <Ol>
|
|
||||||
> <Li>One</Li>
|
|
||||||
> <Li>Two</Li>
|
|
||||||
> </Ol>
|
|
||||||
> ```
|
|
||||||
|
|
||||||
Lists are available as bulleted and numbered. Markdown lists are transformed
|
|
||||||
automatically.
|
|
||||||
|
|
||||||
- I am a bulleted list
|
|
||||||
- I have nice bullets
|
|
||||||
- Lorem ipsum dolor
|
|
||||||
- consectetur adipiscing elit
|
|
||||||
|
|
||||||
1. I am an ordered list
|
|
||||||
2. I have nice numbers
|
|
||||||
3. Lorem ipsum dolor
|
|
||||||
4. consectetur adipiscing elit
|
|
||||||
|
|
||||||
### Aside {#aside}
|
|
||||||
|
|
||||||
> #### Markdown
|
|
||||||
>
|
|
||||||
> ```markdown_
|
|
||||||
> > #### Aside title
|
|
||||||
> > This is aside text.
|
|
||||||
> ```
|
|
||||||
>
|
|
||||||
> #### JSX
|
|
||||||
>
|
|
||||||
> ```jsx
|
|
||||||
> <Aside title="Aside title">This is aside text.</Aside>
|
|
||||||
> ```
|
|
||||||
|
|
||||||
Asides can be used to display additional notes and content in the right-hand
|
|
||||||
column. Asides can contain text, code and other elements if needed. Visually,
|
|
||||||
asides are moved to the side on the X-axis, and displayed at the same level they
|
|
||||||
were inserted. On small screens, they collapse and are rendered in their
|
|
||||||
original position, in between the text.
|
|
||||||
|
|
||||||
To make them easier to use in Markdown, paragraphs formatted as blockquotes will
|
|
||||||
turn into asides by default. Level 4 headlines (with a leading `####`) will
|
|
||||||
become aside titles.
|
|
||||||
|
|
||||||
### Code Block {#code-block}
|
|
||||||
|
|
||||||
> #### Markdown
|
|
||||||
>
|
|
||||||
> ````markdown_
|
|
||||||
> ```python
|
|
||||||
> ### This is a title
|
|
||||||
> import spacy
|
|
||||||
> ```
|
|
||||||
> ````
|
|
||||||
>
|
|
||||||
> #### JSX
|
|
||||||
>
|
|
||||||
> ```jsx
|
|
||||||
> <CodeBlock title="This is a title" lang="python">
|
|
||||||
> import spacy
|
|
||||||
> </CodeBlock>
|
|
||||||
> ```
|
|
||||||
|
|
||||||
Code blocks use the [Prism](http://prismjs.com/) syntax highlighter with a
|
|
||||||
custom theme. The language can be set individually on each block, and defaults
|
|
||||||
to raw text with no highlighting. An optional label can be added as the first
|
|
||||||
line with the prefix `####` (Python-like) and `///` (JavaScript-like). the
|
|
||||||
indented block as plain text and preserve whitespace.
|
|
||||||
|
|
||||||
```python
|
|
||||||
### Using spaCy
|
|
||||||
import spacy
|
|
||||||
nlp = spacy.load("en_core_web_sm")
|
|
||||||
doc = nlp("This is a sentence.")
|
|
||||||
for token in doc:
|
|
||||||
print(token.text, token.pos_)
|
|
||||||
```
|
|
||||||
|
|
||||||
Code blocks can also specify an optional range of line numbers to highlight by
|
|
||||||
adding `{highlight="..."}` to the headline. Acceptable ranges are spans like
|
|
||||||
`5-7`, but also `5-7,10` or `5-7,10,13-14`.
|
|
||||||
|
|
||||||
> #### Markdown
|
|
||||||
>
|
|
||||||
> ````markdown_
|
|
||||||
> ```python
|
|
||||||
> ### This is a title {highlight="1-2"}
|
|
||||||
> import spacy
|
|
||||||
> nlp = spacy.load("en_core_web_sm")
|
|
||||||
> ```
|
|
||||||
> ````
|
|
||||||
|
|
||||||
```python
|
|
||||||
### Using the matcher {highlight="5-7"}
|
|
||||||
import spacy
|
|
||||||
from spacy.matcher import Matcher
|
|
||||||
|
|
||||||
nlp = spacy.load('en_core_web_sm')
|
|
||||||
matcher = Matcher(nlp.vocab)
|
|
||||||
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
|
|
||||||
matcher.add("HelloWorld", None, pattern)
|
|
||||||
doc = nlp("Hello, world! Hello world!")
|
|
||||||
matches = matcher(doc)
|
|
||||||
```
|
|
||||||
|
|
||||||
Adding `{executable="true"}` to the title turns the code into an executable
|
|
||||||
block, powered by [Binder](https://mybinder.org) and
|
|
||||||
[Juniper](https://github.com/ines/juniper). If JavaScript is disabled, the
|
|
||||||
interactive widget defaults to a regular code block.
|
|
||||||
|
|
||||||
> #### Markdown
|
|
||||||
>
|
|
||||||
> ````markdown_
|
|
||||||
> ```python
|
|
||||||
> ### {executable="true"}
|
|
||||||
> import spacy
|
|
||||||
> nlp = spacy.load("en_core_web_sm")
|
|
||||||
> ```
|
|
||||||
> ````
|
|
||||||
|
|
||||||
```python
|
|
||||||
### {executable="true"}
|
|
||||||
import spacy
|
|
||||||
nlp = spacy.load("en_core_web_sm")
|
|
||||||
doc = nlp("This is a sentence.")
|
|
||||||
for token in doc:
|
|
||||||
print(token.text, token.pos_)
|
|
||||||
```
|
|
||||||
|
|
||||||
If a code block only contains a URL to a GitHub file, the raw file contents are
|
|
||||||
embedded automatically and syntax highlighting is applied. The link to the
|
|
||||||
original file is shown at the top of the widget.
|
|
||||||
|
|
||||||
> #### Markdown
|
|
||||||
>
|
|
||||||
> ````markdown_
|
|
||||||
> ```python
|
|
||||||
> https://github.com/...
|
|
||||||
> ```
|
|
||||||
> ````
|
|
||||||
>
|
|
||||||
> #### JSX
|
|
||||||
>
|
|
||||||
> ```jsx
|
|
||||||
> <GitHubCode url="https://github.com/..." lang="python" />
|
|
||||||
> ```
|
|
||||||
|
|
||||||
```python
|
|
||||||
https://github.com/explosion/spaCy/tree/master/spacy/language.py
|
|
||||||
```
|
|
||||||
|
|
||||||
### Infobox {#infobox}
|
|
||||||
|
|
||||||
import Infobox from 'components/infobox'
|
|
||||||
|
|
||||||
> #### JSX
|
|
||||||
>
|
|
||||||
> ```jsx
|
|
||||||
> <Infobox title="Information">Regular infobox</Infobox>
|
|
||||||
> <Infobox title="Important note" variant="warning">This is a warning.</Infobox>
|
|
||||||
> <Infobox title="Be careful!" variant="danger">This is dangerous.</Infobox>
|
|
||||||
> ```
|
|
||||||
|
|
||||||
Infoboxes can be used to add notes, updates, warnings or additional information
|
|
||||||
to a page or section. Semantically, they're implemented and interpreted as an
|
|
||||||
`aside` element. Infoboxes can take an optional `title` argument, as well as an
|
|
||||||
optional `variant` (either `"warning"` or `"danger"`).
|
|
||||||
|
|
||||||
<Infobox title="This is an infobox">
|
|
||||||
|
|
||||||
If needed, an infobox can contain regular text, `inline code`, lists and other
|
|
||||||
blocks.
|
|
||||||
|
|
||||||
</Infobox>
|
|
||||||
|
|
||||||
<Infobox title="This is a warning" variant="warning">
|
|
||||||
|
|
||||||
If needed, an infobox can contain regular text, `inline code`, lists and other
|
|
||||||
blocks.
|
|
||||||
|
|
||||||
</Infobox>
|
|
||||||
|
|
||||||
<Infobox title="This is dangerous" variant="danger">
|
|
||||||
|
|
||||||
If needed, an infobox can contain regular text, `inline code`, lists and other
|
|
||||||
blocks.
|
|
||||||
|
|
||||||
</Infobox>
|
|
||||||
|
|
||||||
### Accordion {#accordion}
|
|
||||||
|
|
||||||
import Accordion from 'components/accordion'
|
|
||||||
|
|
||||||
> #### JSX
|
|
||||||
>
|
|
||||||
> ```jsx
|
|
||||||
> <Accordion title="This is an accordion">
|
|
||||||
> Accordion content goes here.
|
|
||||||
> </Accordion>
|
|
||||||
> ```
|
|
||||||
|
|
||||||
Accordions are collapsible sections that are mostly used for lengthy tables,
|
|
||||||
like the tag and label annotation schemes for different languages. They all need
|
|
||||||
to be presented – but chances are the user doesn't actually care about _all_ of
|
|
||||||
them, especially not at the same time. So it's fairly reasonable to hide them
|
|
||||||
behind a click. This particular implementation was inspired by the amazing
|
|
||||||
[Inclusive Components blog](https://inclusive-components.design/collapsible-sections/).
|
|
||||||
|
|
||||||
<Accordion title="This is an accordion">
|
|
||||||
|
|
||||||
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Quisque enim ante,
|
|
||||||
pretium a orci eget, varius dignissim augue. Nam eu dictum mauris, id tincidunt
|
|
||||||
nisi. Integer commodo pellentesque tincidunt. Nam at turpis finibus tortor
|
|
||||||
gravida sodales tincidunt sit amet est. Nullam euismod arcu in tortor auctor,
|
|
||||||
sit amet dignissim justo congue.
|
|
||||||
|
|
||||||
</Accordion>
|
|
||||||
|
|
||||||
## Setup and installation {#setup}
|
|
||||||
|
|
||||||
Before running the setup, make sure your versions of
|
Before running the setup, make sure your versions of
|
||||||
[Node](https://nodejs.org/en/) and [npm](https://www.npmjs.com/) are up to date.
|
[Node](https://nodejs.org/en/) and [npm](https://www.npmjs.com/) are up to date.
|
||||||
|
@ -554,14 +34,14 @@ extensions for your code editor. The
|
||||||
[`.prettierrc`](https://github.com/explosion/spaCy/tree/master/website/.prettierrc)
|
[`.prettierrc`](https://github.com/explosion/spaCy/tree/master/website/.prettierrc)
|
||||||
file in the root defines the settings used in this codebase.
|
file in the root defines the settings used in this codebase.
|
||||||
|
|
||||||
## Building & developing the site with Docker {#docker}
|
## Building & developing the site with Docker
|
||||||
Sometimes it's hard to get a local environment working due to rapid updates to node dependencies,
|
|
||||||
so it may be easier to use docker for building the docs.
|
|
||||||
|
|
||||||
If you'd like to do this,
|
Sometimes it's hard to get a local environment working due to rapid updates to
|
||||||
**be sure you do *not* include your local `node_modules` folder**,
|
node dependencies, so it may be easier to use docker for building the docs.
|
||||||
since there are some dependencies that need to be built for the image system.
|
|
||||||
Rename it before using.
|
If you'd like to do this, **be sure you do _not_ include your local
|
||||||
|
`node_modules` folder**, since there are some dependencies that need to be built
|
||||||
|
for the image system. Rename it before using.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
docker run -it \
|
docker run -it \
|
||||||
|
@ -571,16 +51,16 @@ docker run -it \
|
||||||
gatsby develop -H 0.0.0.0
|
gatsby develop -H 0.0.0.0
|
||||||
```
|
```
|
||||||
|
|
||||||
This will allow you to access the built website at http://0.0.0.0:8000/
|
This will allow you to access the built website at http://0.0.0.0:8000/ in your
|
||||||
in your browser, and still edit code in your editor while having the site
|
browser, and still edit code in your editor while having the site reflect those
|
||||||
reflect those changes.
|
changes.
|
||||||
|
|
||||||
**Note**: If you're working on a Mac with an M1 processor,
|
**Note**: If you're working on a Mac with an M1 processor, you might see
|
||||||
you might see segfault errors from `qemu` if you use the default image.
|
segfault errors from `qemu` if you use the default image. To fix this use the
|
||||||
To fix this use the `arm64` tagged image in the `docker run` command
|
`arm64` tagged image in the `docker run` command
|
||||||
(ghcr.io/explosion/spacy-io:arm64).
|
(ghcr.io/explosion/spacy-io:arm64).
|
||||||
|
|
||||||
### Building the Docker image {#docker-build}
|
### Building the Docker image
|
||||||
|
|
||||||
If you'd like to build the image locally, you can do so like this:
|
If you'd like to build the image locally, you can do so like this:
|
||||||
|
|
||||||
|
@ -588,67 +68,21 @@ If you'd like to build the image locally, you can do so like this:
|
||||||
docker build -t spacy-io .
|
docker build -t spacy-io .
|
||||||
```
|
```
|
||||||
|
|
||||||
This will take some time, so if you want to use the prebuilt image you'll save a bit of time.
|
This will take some time, so if you want to use the prebuilt image you'll save a
|
||||||
|
bit of time.
|
||||||
|
|
||||||
## Markdown reference {#markdown}
|
## Project structure
|
||||||
|
|
||||||
All page content and page meta lives in the `.md` files in the `/docs`
|
|
||||||
directory. The frontmatter block at the top of each file defines the page title
|
|
||||||
and other settings like the sidebar menu.
|
|
||||||
|
|
||||||
````markdown
|
|
||||||
---
|
|
||||||
title: Page title
|
|
||||||
---
|
|
||||||
|
|
||||||
## Headline starting a section {#some_id}
|
|
||||||
|
|
||||||
This is a regular paragraph with a [link](https://spacy.io) and **bold text**.
|
|
||||||
|
|
||||||
> #### This is an aside title
|
|
||||||
>
|
|
||||||
> This is aside text.
|
|
||||||
|
|
||||||
### Subheadline
|
|
||||||
|
|
||||||
| Header 1 | Header 2 |
|
|
||||||
| -------- | -------- |
|
|
||||||
| Column 1 | Column 2 |
|
|
||||||
|
|
||||||
```python
|
|
||||||
### Code block title {highlight="2-3"}
|
|
||||||
import spacy
|
|
||||||
nlp = spacy.load("en_core_web_sm")
|
|
||||||
doc = nlp("Hello world")
|
|
||||||
```
|
|
||||||
|
|
||||||
<Infobox title="Important note" variant="warning">
|
|
||||||
|
|
||||||
This is content in the infobox.
|
|
||||||
|
|
||||||
</Infobox>
|
|
||||||
````
|
|
||||||
|
|
||||||
In addition to the native markdown elements, you can use the components
|
|
||||||
[`<Infobox />`][infobox], [`<Accordion />`][accordion], [`<Abbr />`][abbr] and
|
|
||||||
[`<Tag />`][tag] via their JSX syntax.
|
|
||||||
|
|
||||||
[infobox]: https://spacy.io/styleguide#infobox
|
|
||||||
[accordion]: https://spacy.io/styleguide#accordion
|
|
||||||
[abbr]: https://spacy.io/styleguide#abbr
|
|
||||||
[tag]: https://spacy.io/styleguide#tag
|
|
||||||
|
|
||||||
## Project structure {#structure}
|
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
### Directory structure
|
|
||||||
├── docs # the actual markdown content
|
├── docs # the actual markdown content
|
||||||
├── meta # JSON-formatted site metadata
|
├── meta # JSON-formatted site metadata
|
||||||
| ├── languages.json # supported languages and statistical models
|
| ├── languages.json # supported languages and statistical models
|
||||||
| ├── sidebars.json # sidebar navigations for different sections
|
| ├── sidebars.json # sidebar navigations for different sections
|
||||||
| ├── site.json # general site metadata
|
| ├── site.json # general site metadata
|
||||||
|
| ├── type-annotations.json # Type annotations
|
||||||
| └── universe.json # data for the spaCy universe section
|
| └── universe.json # data for the spaCy universe section
|
||||||
├── public # compiled site
|
├── public # compiled site
|
||||||
|
├── setup # Jinja setup
|
||||||
├── src # source
|
├── src # source
|
||||||
| ├── components # React components
|
| ├── components # React components
|
||||||
| ├── fonts # webfonts
|
| ├── fonts # webfonts
|
||||||
|
@ -661,54 +95,10 @@ In addition to the native markdown elements, you can use the components
|
||||||
| | ├── models.js # layout template for model pages
|
| | ├── models.js # layout template for model pages
|
||||||
| | └── universe.js # layout templates for universe
|
| | └── universe.js # layout templates for universe
|
||||||
| └── widgets # non-reusable components with content, e.g. changelog
|
| └── widgets # non-reusable components with content, e.g. changelog
|
||||||
|
├── .eslintrc.json # ESLint config file
|
||||||
|
├── .prettierrc # Prettier config file
|
||||||
├── gatsby-browser.js # browser-specific hooks for Gatsby
|
├── gatsby-browser.js # browser-specific hooks for Gatsby
|
||||||
├── gatsby-config.js # Gatsby configuration
|
├── gatsby-config.js # Gatsby configuration
|
||||||
├── gatsby-node.js # Node-specific hooks for Gatsby
|
├── gatsby-node.js # Node-specific hooks for Gatsby
|
||||||
└── package.json # package settings and dependencies
|
└── package.json # package settings and dependencies
|
||||||
```
|
```
|
||||||
|
|
||||||
## Editorial {#editorial}
|
|
||||||
|
|
||||||
- "spaCy" should always be spelled with a lowercase "s" and a capital "C",
|
|
||||||
unless it specifically refers to the Python package or Python import `spacy`
|
|
||||||
(in which case it should be formatted as code).
|
|
||||||
- ✅ spaCy is a library for advanced NLP in Python.
|
|
||||||
- ❌ Spacy is a library for advanced NLP in Python.
|
|
||||||
- ✅ First, you need to install the `spacy` package from pip.
|
|
||||||
- Mentions of code, like function names, classes, variable names etc. in inline
|
|
||||||
text should be formatted as `code`.
|
|
||||||
- ✅ "Calling the `nlp` object on a text returns a `Doc`."
|
|
||||||
- Objects that have pages in the [API docs](/api) should be linked – for
|
|
||||||
example, [`Doc`](/api/doc) or [`Language.to_disk`](/api/language#to_disk). The
|
|
||||||
mentions should still be formatted as code within the link. Links pointing to
|
|
||||||
the API docs will automatically receive a little icon. However, if a paragraph
|
|
||||||
includes many references to the API, the links can easily get messy. In that
|
|
||||||
case, we typically only link the first mention of an object and not any
|
|
||||||
subsequent ones.
|
|
||||||
- ✅ The [`Span`](/api/span) and [`Token`](/api/token) objects are views of a
|
|
||||||
[`Doc`](/api/doc). [`Span.as_doc`](/api/span#as_doc) creates a `Doc` object
|
|
||||||
from a `Span`.
|
|
||||||
- ❌ The [`Span`](/api/span) and [`Token`](/api/token) objects are views of a
|
|
||||||
[`Doc`](/api/doc). [`Span.as_doc`](/api/span#as_doc) creates a
|
|
||||||
[`Doc`](/api/doc) object from a [`Span`](/api/span).
|
|
||||||
|
|
||||||
* Other things we format as code are: references to trained pipeline packages
|
|
||||||
like `en_core_web_sm` or file names like `code.py` or `meta.json`.
|
|
||||||
|
|
||||||
- ✅ After training, the `config.cfg` is saved to disk.
|
|
||||||
|
|
||||||
* [Type annotations](#type-annotations) are a special type of code formatting,
|
|
||||||
expressed by wrapping the text in `~~` instead of backticks. The result looks
|
|
||||||
like this: ~~List[Doc]~~. All references to known types will be linked
|
|
||||||
automatically.
|
|
||||||
|
|
||||||
- ✅ The model has the input type ~~List[Doc]~~ and it outputs a
|
|
||||||
~~List[Array2d]~~.
|
|
||||||
|
|
||||||
* We try to keep links meaningful but short.
|
|
||||||
- ✅ For details, see the usage guide on
|
|
||||||
[training with custom code](/usage/training#custom-code).
|
|
||||||
- ❌ For details, see
|
|
||||||
[the usage guide on training with custom code](/usage/training#custom-code).
|
|
||||||
- ❌ For details, see the usage guide on training with custom code
|
|
||||||
[here](/usage/training#custom-code).
|
|
||||||
|
|
|
@ -1391,12 +1391,13 @@ If the contents are different, the new version of the file is uploaded. Deleting
|
||||||
obsolete files is left up to you.
|
obsolete files is left up to you.
|
||||||
|
|
||||||
Remotes can be defined in the `remotes` section of the
|
Remotes can be defined in the `remotes` section of the
|
||||||
[`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses the
|
[`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses
|
||||||
[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to
|
[`Pathy`](https://github.com/justindujardin/pathy) to communicate with the
|
||||||
communicate with the remote storages, so you can use any protocol that
|
remote storages, so you can use any protocol that `Pathy` supports, including
|
||||||
`smart-open` supports, including [S3](https://aws.amazon.com/s3/),
|
[S3](https://aws.amazon.com/s3/),
|
||||||
[Google Cloud Storage](https://cloud.google.com/storage), SSH and more, although
|
[Google Cloud Storage](https://cloud.google.com/storage), and the local
|
||||||
you may need to install extra dependencies to use certain protocols.
|
filesystem, although you may need to install extra dependencies to use certain
|
||||||
|
protocols.
|
||||||
|
|
||||||
```cli
|
```cli
|
||||||
$ python -m spacy project push [remote] [project_dir]
|
$ python -m spacy project push [remote] [project_dir]
|
||||||
|
@ -1435,12 +1436,13 @@ outputs, so if you change the config back, you'll be able to fetch back the
|
||||||
result.
|
result.
|
||||||
|
|
||||||
Remotes can be defined in the `remotes` section of the
|
Remotes can be defined in the `remotes` section of the
|
||||||
[`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses the
|
[`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses
|
||||||
[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to
|
[`Pathy`](https://github.com/justindujardin/pathy) to communicate with the
|
||||||
communicate with the remote storages, so you can use any protocol that
|
remote storages, so you can use any protocol that `Pathy` supports, including
|
||||||
`smart-open` supports, including [S3](https://aws.amazon.com/s3/),
|
[S3](https://aws.amazon.com/s3/),
|
||||||
[Google Cloud Storage](https://cloud.google.com/storage), SSH and more, although
|
[Google Cloud Storage](https://cloud.google.com/storage), and the local
|
||||||
you may need to install extra dependencies to use certain protocols.
|
filesystem, although you may need to install extra dependencies to use certain
|
||||||
|
protocols.
|
||||||
|
|
||||||
```cli
|
```cli
|
||||||
$ python -m spacy project pull [remote] [project_dir]
|
$ python -m spacy project pull [remote] [project_dir]
|
||||||
|
|
|
@ -1004,6 +1004,54 @@ This method was previously available as `spacy.gold.spans_from_biluo_tags`.
|
||||||
| `tags` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ |
|
| `tags` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ |
|
||||||
| **RETURNS** | A sequence of `Span` objects with added entity labels. ~~List[Span]~~ |
|
| **RETURNS** | A sequence of `Span` objects with added entity labels. ~~List[Span]~~ |
|
||||||
|
|
||||||
|
### training.biluo_to_iob {#biluo_to_iob tag="function"}
|
||||||
|
|
||||||
|
Convert a sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags to
|
||||||
|
[IOB](/usage/linguistic-features#accessing-ner) tags. This is useful if you want
|
||||||
|
to use the BILUO tags with a model that only supports IOB tags.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> from spacy.training import biluo_to_iob
|
||||||
|
>
|
||||||
|
> tags = ["O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
|
||||||
|
> iob_tags = biluo_to_iob(tags)
|
||||||
|
> assert iob_tags == ["O", "O", "B-LOC", "I-LOC", "I-LOC", "O"]
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | --------------------------------------------------------------------------------------- |
|
||||||
|
| `tags` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags. ~~Iterable[str]~~ |
|
||||||
|
| **RETURNS** | A list of [IOB](/usage/linguistic-features#accessing-ner) tags. ~~List[str]~~ |
|
||||||
|
|
||||||
|
### training.iob_to_biluo {#iob_to_biluo tag="function"}
|
||||||
|
|
||||||
|
Convert a sequence of [IOB](/usage/linguistic-features#accessing-ner) tags to
|
||||||
|
[BILUO](/usage/linguistic-features#accessing-ner) tags. This is useful if you
|
||||||
|
want to use the IOB tags with a model that only supports BILUO tags.
|
||||||
|
|
||||||
|
<Infobox title="Changed in v3.0" variant="warning" id="iob_to_biluo">
|
||||||
|
|
||||||
|
This method was previously available as `spacy.gold.iob_to_biluo`.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> from spacy.training import iob_to_biluo
|
||||||
|
>
|
||||||
|
> tags = ["O", "O", "B-LOC", "I-LOC", "O"]
|
||||||
|
> biluo_tags = iob_to_biluo(tags)
|
||||||
|
> assert biluo_tags == ["O", "O", "B-LOC", "L-LOC", "O"]
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | ------------------------------------------------------------------------------------- |
|
||||||
|
| `tags` | A sequence of [IOB](/usage/linguistic-features#accessing-ner) tags. ~~Iterable[str]~~ |
|
||||||
|
| **RETURNS** | A list of [BILUO](/usage/linguistic-features#accessing-ner) tags. ~~List[str]~~ |
|
||||||
|
|
||||||
## Utility functions {#util source="spacy/util.py"}
|
## Utility functions {#util source="spacy/util.py"}
|
||||||
|
|
||||||
spaCy comes with a small collection of utility functions located in
|
spaCy comes with a small collection of utility functions located in
|
||||||
|
|
|
@ -308,14 +308,14 @@ Load state from a binary string.
|
||||||
> assert type(PERSON) == int
|
> assert type(PERSON) == int
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ---------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ---------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `strings` | A table managing the string-to-int mapping. ~~StringStore~~ |
|
| `strings` | A table managing the string-to-int mapping. ~~StringStore~~ |
|
||||||
| `vectors` | A table associating word IDs to word vectors. ~~Vectors~~ |
|
| `vectors` | A table associating word IDs to word vectors. ~~Vectors~~ |
|
||||||
| `vectors_length` | Number of dimensions for each word vector. ~~int~~ |
|
| `vectors_length` | Number of dimensions for each word vector. ~~int~~ |
|
||||||
| `lookups` | The available lookup tables in this vocab. ~~Lookups~~ |
|
| `lookups` | The available lookup tables in this vocab. ~~Lookups~~ |
|
||||||
| `writing_system` | A dict with information about the language's writing system. ~~Dict[str, Any]~~ |
|
| `writing_system` | A dict with information about the language's writing system. ~~Dict[str, Any]~~ |
|
||||||
| `get_noun_chunks` <Tag variant="new">3.0</Tag> | A function that yields base noun phrases used for [`Doc.noun_chunks`](/ap/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]~~ |
|
| `get_noun_chunks` <Tag variant="new">3.0</Tag> | A function that yields base noun phrases used for [`Doc.noun_chunks`](/api/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]~~ |
|
||||||
|
|
||||||
## Serialization fields {#serialization-fields}
|
## Serialization fields {#serialization-fields}
|
||||||
|
|
||||||
|
|
|
@ -8,9 +8,7 @@ menu:
|
||||||
- ['Typography', 'typography']
|
- ['Typography', 'typography']
|
||||||
- ['Elements', 'elements']
|
- ['Elements', 'elements']
|
||||||
- ['Components', 'components']
|
- ['Components', 'components']
|
||||||
- ['Setup & Installation', 'setup']
|
|
||||||
- ['Markdown Reference', 'markdown']
|
- ['Markdown Reference', 'markdown']
|
||||||
- ['Project Structure', 'structure']
|
|
||||||
- ['Editorial', 'editorial']
|
- ['Editorial', 'editorial']
|
||||||
sidebar:
|
sidebar:
|
||||||
- label: Styleguide
|
- label: Styleguide
|
||||||
|
@ -25,6 +23,610 @@ sidebar:
|
||||||
url: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
|
url: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
|
||||||
---
|
---
|
||||||
|
|
||||||
import Readme from 'README.md'
|
The [spacy.io](https://spacy.io) website is implemented using
|
||||||
|
[Gatsby](https://www.gatsbyjs.org) with
|
||||||
|
[Remark](https://github.com/remarkjs/remark) and [MDX](https://mdxjs.com/). This
|
||||||
|
allows authoring content in **straightforward Markdown** without the usual
|
||||||
|
limitations. Standard elements can be overwritten with powerful
|
||||||
|
[React](http://reactjs.org/) components and wherever Markdown syntax isn't
|
||||||
|
enough, JSX components can be used.
|
||||||
|
|
||||||
<Readme />
|
> #### Contributing to the site
|
||||||
|
>
|
||||||
|
> The docs can always use another example or more detail, and they should always
|
||||||
|
> be up to date and not misleading. We always appreciate a
|
||||||
|
> [pull request](https://github.com/explosion/spaCy/pulls). To quickly find the
|
||||||
|
> correct file to edit, simply click on the "Suggest edits" button at the bottom
|
||||||
|
> of a page.
|
||||||
|
>
|
||||||
|
> For more details on editing the site locally, see the installation
|
||||||
|
> instructions and markdown reference below.
|
||||||
|
|
||||||
|
## Logo {#logo source="website/src/images/logo.svg"}
|
||||||
|
|
||||||
|
import { Logos } from 'widgets/styleguide'
|
||||||
|
|
||||||
|
If you would like to use the spaCy logo on your site, please get in touch and
|
||||||
|
ask us first. However, if you want to show support and tell others that your
|
||||||
|
project is using spaCy, you can grab one of our
|
||||||
|
[spaCy badges](/usage/spacy-101#faq-project-with-spacy).
|
||||||
|
|
||||||
|
<Logos />
|
||||||
|
|
||||||
|
## Colors {#colors}
|
||||||
|
|
||||||
|
import { Colors, Patterns } from 'widgets/styleguide'
|
||||||
|
|
||||||
|
<Colors />
|
||||||
|
|
||||||
|
### Patterns
|
||||||
|
|
||||||
|
<Patterns />
|
||||||
|
|
||||||
|
## Typography {#typography}
|
||||||
|
|
||||||
|
import { H1, H2, H3, H4, H5, Label, InlineList, Comment } from
|
||||||
|
'components/typography'
|
||||||
|
|
||||||
|
> #### Markdown
|
||||||
|
>
|
||||||
|
> ```markdown_
|
||||||
|
> ## Headline 2
|
||||||
|
> ## Headline 2 {#some_id}
|
||||||
|
> ## Headline 2 {#some_id tag="method"}
|
||||||
|
> ```
|
||||||
|
>
|
||||||
|
> #### JSX
|
||||||
|
>
|
||||||
|
> ```jsx
|
||||||
|
> <H2>Headline 2</H2>
|
||||||
|
> <H2 id="some_id">Headline 2</H2>
|
||||||
|
> <H2 id="some_id" tag="method">Headline 2</H2>
|
||||||
|
> ```
|
||||||
|
|
||||||
|
Headlines are set in
|
||||||
|
[HK Grotesk](http://cargocollective.com/hanken/HK-Grotesk-Open-Source-Font) by
|
||||||
|
Hanken Design. All other body text and code uses the best-matching default
|
||||||
|
system font to provide a "native" reading experience. All code uses the
|
||||||
|
[JetBrains Mono](https://www.jetbrains.com/lp/mono/) typeface by JetBrains.
|
||||||
|
|
||||||
|
<Infobox title="Important note" variant="warning">
|
||||||
|
|
||||||
|
Level 2 headings are automatically wrapped in `<section>` elements at compile
|
||||||
|
time, using a custom
|
||||||
|
[Markdown transformer](https://github.com/explosion/spaCy/tree/master/website/plugins/remark-wrap-section.js).
|
||||||
|
This makes it easier to highlight the section that's currently in the viewport
|
||||||
|
in the sidebar menu.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
|
<div>
|
||||||
|
<H1>Headline 1</H1>
|
||||||
|
<H2>Headline 2</H2>
|
||||||
|
<H3>Headline 3</H3>
|
||||||
|
<H4>Headline 4</H4>
|
||||||
|
<H5>Headline 5</H5>
|
||||||
|
<Label>Label</Label>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
The following optional attributes can be set on the headline to modify it. For
|
||||||
|
example, to add a tag for the documented type or mark features that have been
|
||||||
|
introduced in a specific version or require statistical models to be loaded.
|
||||||
|
Tags are also available as standalone `<Tag />` components.
|
||||||
|
|
||||||
|
| Argument | Example | Result |
|
||||||
|
| -------- | -------------------------- | ----------------------------------------- |
|
||||||
|
| `tag` | `{tag="method"}` | <Tag>method</Tag> |
|
||||||
|
| `new` | `{new="3"}` | <Tag variant="new">3</Tag> |
|
||||||
|
| `model` | `{model="tagger, parser"}` | <Tag variant="model">tagger, parser</Tag> |
|
||||||
|
| `hidden` | `{hidden="true"}` | |
|
||||||
|
|
||||||
|
## Elements {#elements}
|
||||||
|
|
||||||
|
### Links {#links}
|
||||||
|
|
||||||
|
> #### Markdown
|
||||||
|
>
|
||||||
|
> ```markdown
|
||||||
|
> [I am a link](https://spacy.io)
|
||||||
|
> ```
|
||||||
|
>
|
||||||
|
> #### JSX
|
||||||
|
>
|
||||||
|
> ```jsx
|
||||||
|
> <Link to="https://spacy.io">I am a link</Link>
|
||||||
|
> ```
|
||||||
|
|
||||||
|
Special link styles are used depending on the link URL.
|
||||||
|
|
||||||
|
- [I am a regular external link](https://explosion.ai)
|
||||||
|
- [I am a link to the documentation](/api/doc)
|
||||||
|
- [I am a link to an architecture](/api/architectures#HashEmbedCNN)
|
||||||
|
- [I am a link to a model](/models/en#en_core_web_sm)
|
||||||
|
- [I am a link to GitHub](https://github.com/explosion/spaCy)
|
||||||
|
|
||||||
|
### Abbreviations {#abbr}
|
||||||
|
|
||||||
|
import { Abbr } from 'components/typography'
|
||||||
|
|
||||||
|
> #### JSX
|
||||||
|
>
|
||||||
|
> ```jsx
|
||||||
|
> <Abbr title="Explanation">Abbreviation</Abbr>
|
||||||
|
> ```
|
||||||
|
|
||||||
|
Some text with <Abbr title="Explanation here">an abbreviation</Abbr>. On small
|
||||||
|
screens, I collapse and the explanation text is displayed next to the
|
||||||
|
abbreviation.
|
||||||
|
|
||||||
|
### Tags {#tags}
|
||||||
|
|
||||||
|
import Tag from 'components/tag'
|
||||||
|
|
||||||
|
> ```jsx
|
||||||
|
> <Tag>method</Tag>
|
||||||
|
> <Tag variant="new">4</Tag>
|
||||||
|
> <Tag variant="model">tagger, parser</Tag>
|
||||||
|
> ```
|
||||||
|
|
||||||
|
Tags can be used together with headlines, or next to properties across the
|
||||||
|
documentation, and combined with tooltips to provide additional information. An
|
||||||
|
optional `variant` argument can be used for special tags. `variant="new"` makes
|
||||||
|
the tag take a version number to mark new features. Using the component,
|
||||||
|
visibility of this tag can later be toggled once the feature isn't considered
|
||||||
|
new anymore. Setting `variant="model"` takes a description of model capabilities
|
||||||
|
and can be used to mark features that require a respective model to be
|
||||||
|
installed.
|
||||||
|
|
||||||
|
<InlineList>
|
||||||
|
|
||||||
|
<Tag>method</Tag> <Tag variant="new">4</Tag> <Tag variant="model">tagger,
|
||||||
|
parser</Tag>
|
||||||
|
|
||||||
|
</InlineList>
|
||||||
|
|
||||||
|
### Buttons {#buttons}
|
||||||
|
|
||||||
|
import Button from 'components/button'
|
||||||
|
|
||||||
|
> ```jsx
|
||||||
|
> <Button to="#" variant="primary">Primary small</Button>
|
||||||
|
> <Button to="#" variant="secondary">Secondary small</Button>
|
||||||
|
> ```
|
||||||
|
|
||||||
|
Link buttons come in two variants, `primary` and `secondary` and two sizes, with
|
||||||
|
an optional `large` size modifier. Since they're mostly used as enhanced links,
|
||||||
|
the buttons are implemented as styled links instead of native button elements.
|
||||||
|
|
||||||
|
<InlineList><Button to="#" variant="primary">Primary small</Button>
|
||||||
|
<Button to="#" variant="secondary">Secondary small</Button></InlineList>
|
||||||
|
|
||||||
|
<br />
|
||||||
|
|
||||||
|
<InlineList><Button to="#" variant="primary" large>Primary large</Button>
|
||||||
|
<Button to="#" variant="secondary" large>Secondary large</Button></InlineList>
|
||||||
|
|
||||||
|
## Components
|
||||||
|
|
||||||
|
### Table {#table}
|
||||||
|
|
||||||
|
> #### Markdown
|
||||||
|
>
|
||||||
|
> ```markdown_
|
||||||
|
> | Header 1 | Header 2 |
|
||||||
|
> | -------- | -------- |
|
||||||
|
> | Column 1 | Column 2 |
|
||||||
|
> ```
|
||||||
|
>
|
||||||
|
> #### JSX
|
||||||
|
>
|
||||||
|
> ```markup
|
||||||
|
> <Table>
|
||||||
|
> <Tr><Th>Header 1</Th><Th>Header 2</Th></Tr>
|
||||||
|
> <Tr><Td>Column 1</Td><Td>Column 2</Td></Tr>
|
||||||
|
> </Table>
|
||||||
|
> ```
|
||||||
|
|
||||||
|
Tables are used to present data and API documentation. Certain keywords can be
|
||||||
|
used to mark a footer row with a distinct style, for example to visualize the
|
||||||
|
return values of a documented function.
|
||||||
|
|
||||||
|
| Header 1 | Header 2 | Header 3 | Header 4 |
|
||||||
|
| ----------- | -------- | :------: | -------: |
|
||||||
|
| Column 1 | Column 2 | Column 3 | Column 4 |
|
||||||
|
| Column 1 | Column 2 | Column 3 | Column 4 |
|
||||||
|
| Column 1 | Column 2 | Column 3 | Column 4 |
|
||||||
|
| Column 1 | Column 2 | Column 3 | Column 4 |
|
||||||
|
| **RETURNS** | Column 2 | Column 3 | Column 4 |
|
||||||
|
|
||||||
|
Tables also support optional "divider" rows that are typically used to denote
|
||||||
|
keyword-only arguments in API documentation. To turn a row into a dividing
|
||||||
|
headline, it should only include content in its first cell, and its value should
|
||||||
|
be italicized:
|
||||||
|
|
||||||
|
> #### Markdown
|
||||||
|
>
|
||||||
|
> ```markdown_
|
||||||
|
> | Header 1 | Header 2 | Header 3 |
|
||||||
|
> | -------- | -------- | -------- |
|
||||||
|
> | Column 1 | Column 2 | Column 3 |
|
||||||
|
> | _Hello_ | | |
|
||||||
|
> | Column 1 | Column 2 | Column 3 |
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Header 1 | Header 2 | Header 3 |
|
||||||
|
| -------- | -------- | -------- |
|
||||||
|
| Column 1 | Column 2 | Column 3 |
|
||||||
|
| _Hello_ | | |
|
||||||
|
| Column 1 | Column 2 | Column 3 |
|
||||||
|
|
||||||
|
### Type Annotations {#type-annotations}
|
||||||
|
|
||||||
|
> #### Markdown
|
||||||
|
>
|
||||||
|
> ```markdown_
|
||||||
|
> ~~Model[List[Doc], Floats2d]~~
|
||||||
|
> ```
|
||||||
|
>
|
||||||
|
> #### JSX
|
||||||
|
>
|
||||||
|
> ```markup
|
||||||
|
> <TypeAnnotation>Model[List[Doc], Floats2d]</TypeAnnotation>
|
||||||
|
> ```
|
||||||
|
|
||||||
|
Type annotations are special inline code blocks that are used to describe Python
|
||||||
|
types in the [type hints](https://docs.python.org/3/library/typing.html) format.
|
||||||
|
The special component will split the type, apply syntax highlighting and link
|
||||||
|
all types that specify links in `meta/type-annotations.json`. Types can link to
|
||||||
|
internal or external documentation pages. To make it easy to represent the type
|
||||||
|
annotations in Markdown, the rendering "hijacks" the `~~` tags that would
|
||||||
|
typically be converted to a `<del>` element – but in this case, text surrounded
|
||||||
|
by `~~` becomes a type annotation.
|
||||||
|
|
||||||
|
- ~~Dict[str, List[Union[Doc, Span]]]~~
|
||||||
|
- ~~Model[List[Doc], List[numpy.ndarray]]~~
|
||||||
|
|
||||||
|
Type annotations support a special visual style in tables and will render as a
|
||||||
|
separate row, under the cell text. This allows the API docs to display complex
|
||||||
|
types without taking up too much space in the cell. The type annotation should
|
||||||
|
always be the **last element** in the row.
|
||||||
|
|
||||||
|
> #### Markdown
|
||||||
|
>
|
||||||
|
> ```markdown_
|
||||||
|
> | Header 1 | Header 2 |
|
||||||
|
> | -------- | ----------------------- |
|
||||||
|
> | Column 1 | Column 2 ~~List[Doc]~~ |
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||||
|
| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. ~~Model[List[Doc], FullTransformerBatch]~~ |
|
||||||
|
| `set_extra_annotations` | Function that takes a batch of `Doc` objects and transformer outputs and can set additional annotations on the `Doc`. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
|
||||||
|
|
||||||
|
### List {#list}
|
||||||
|
|
||||||
|
> #### Markdown
|
||||||
|
>
|
||||||
|
> ```markdown_
|
||||||
|
> 1. One
|
||||||
|
> 2. Two
|
||||||
|
> ```
|
||||||
|
>
|
||||||
|
> #### JSX
|
||||||
|
>
|
||||||
|
> ```markup
|
||||||
|
> <Ol>
|
||||||
|
> <Li>One</Li>
|
||||||
|
> <Li>Two</Li>
|
||||||
|
> </Ol>
|
||||||
|
> ```
|
||||||
|
|
||||||
|
Lists are available as bulleted and numbered. Markdown lists are transformed
|
||||||
|
automatically.
|
||||||
|
|
||||||
|
- I am a bulleted list
|
||||||
|
- I have nice bullets
|
||||||
|
- Lorem ipsum dolor
|
||||||
|
- consectetur adipiscing elit
|
||||||
|
|
||||||
|
1. I am an ordered list
|
||||||
|
2. I have nice numbers
|
||||||
|
3. Lorem ipsum dolor
|
||||||
|
4. consectetur adipiscing elit
|
||||||
|
|
||||||
|
### Aside {#aside}
|
||||||
|
|
||||||
|
> #### Markdown
|
||||||
|
>
|
||||||
|
> ```markdown_
|
||||||
|
> > #### Aside title
|
||||||
|
> > This is aside text.
|
||||||
|
> ```
|
||||||
|
>
|
||||||
|
> #### JSX
|
||||||
|
>
|
||||||
|
> ```jsx
|
||||||
|
> <Aside title="Aside title">This is aside text.</Aside>
|
||||||
|
> ```
|
||||||
|
|
||||||
|
Asides can be used to display additional notes and content in the right-hand
|
||||||
|
column. Asides can contain text, code and other elements if needed. Visually,
|
||||||
|
asides are moved to the side on the X-axis, and displayed at the same level they
|
||||||
|
were inserted. On small screens, they collapse and are rendered in their
|
||||||
|
original position, in between the text.
|
||||||
|
|
||||||
|
To make them easier to use in Markdown, paragraphs formatted as blockquotes will
|
||||||
|
turn into asides by default. Level 4 headlines (with a leading `####`) will
|
||||||
|
become aside titles.
|
||||||
|
|
||||||
|
### Code Block {#code-block}
|
||||||
|
|
||||||
|
> #### Markdown
|
||||||
|
>
|
||||||
|
> ````markdown_
|
||||||
|
> ```python
|
||||||
|
> ### This is a title
|
||||||
|
> import spacy
|
||||||
|
> ```
|
||||||
|
> ````
|
||||||
|
>
|
||||||
|
> #### JSX
|
||||||
|
>
|
||||||
|
> ```jsx
|
||||||
|
> <CodeBlock title="This is a title" lang="python">
|
||||||
|
> import spacy
|
||||||
|
> </CodeBlock>
|
||||||
|
> ```
|
||||||
|
|
||||||
|
Code blocks use the [Prism](http://prismjs.com/) syntax highlighter with a
|
||||||
|
custom theme. The language can be set individually on each block, and defaults
|
||||||
|
to raw text with no highlighting. An optional label can be added as the first
|
||||||
|
line with the prefix `###` (Python-like) and `///` (JavaScript-like). The
|
||||||
|
indented block is rendered as plain text and whitespace is preserved.
|
||||||
|
|
||||||
|
```python
|
||||||
|
### Using spaCy
|
||||||
|
import spacy
|
||||||
|
nlp = spacy.load("en_core_web_sm")
|
||||||
|
doc = nlp("This is a sentence.")
|
||||||
|
for token in doc:
|
||||||
|
print(token.text, token.pos_)
|
||||||
|
```
|
||||||
|
|
||||||
|
Code blocks can also specify an optional range of line numbers to highlight by
|
||||||
|
adding `{highlight="..."}` to the headline. Acceptable ranges are spans like
|
||||||
|
`5-7`, but also `5-7,10` or `5-7,10,13-14`.
|
||||||
|
|
||||||
|
> #### Markdown
|
||||||
|
>
|
||||||
|
> ````markdown_
|
||||||
|
> ```python
|
||||||
|
> ### This is a title {highlight="1-2"}
|
||||||
|
> import spacy
|
||||||
|
> nlp = spacy.load("en_core_web_sm")
|
||||||
|
> ```
|
||||||
|
> ````
|
||||||
|
|
||||||
|
```python
|
||||||
|
### Using the matcher {highlight="5-7"}
|
||||||
|
import spacy
|
||||||
|
from spacy.matcher import Matcher
|
||||||
|
|
||||||
|
nlp = spacy.load('en_core_web_sm')
|
||||||
|
matcher = Matcher(nlp.vocab)
|
||||||
|
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
|
||||||
|
matcher.add("HelloWorld", None, pattern)
|
||||||
|
doc = nlp("Hello, world! Hello world!")
|
||||||
|
matches = matcher(doc)
|
||||||
|
```
|
||||||
|
|
||||||
|
Adding `{executable="true"}` to the title turns the code into an executable
|
||||||
|
block, powered by [Binder](https://mybinder.org) and
|
||||||
|
[Juniper](https://github.com/ines/juniper). If JavaScript is disabled, the
|
||||||
|
interactive widget defaults to a regular code block.
|
||||||
|
|
||||||
|
> #### Markdown
|
||||||
|
>
|
||||||
|
> ````markdown_
|
||||||
|
> ```python
|
||||||
|
> ### {executable="true"}
|
||||||
|
> import spacy
|
||||||
|
> nlp = spacy.load("en_core_web_sm")
|
||||||
|
> ```
|
||||||
|
> ````
|
||||||
|
|
||||||
|
```python
|
||||||
|
### {executable="true"}
|
||||||
|
import spacy
|
||||||
|
nlp = spacy.load("en_core_web_sm")
|
||||||
|
doc = nlp("This is a sentence.")
|
||||||
|
for token in doc:
|
||||||
|
print(token.text, token.pos_)
|
||||||
|
```
|
||||||
|
|
||||||
|
If a code block only contains a URL to a GitHub file, the raw file contents are
|
||||||
|
embedded automatically and syntax highlighting is applied. The link to the
|
||||||
|
original file is shown at the top of the widget.
|
||||||
|
|
||||||
|
> #### Markdown
|
||||||
|
>
|
||||||
|
> ````markdown_
|
||||||
|
> ```python
|
||||||
|
> https://github.com/...
|
||||||
|
> ```
|
||||||
|
> ````
|
||||||
|
>
|
||||||
|
> #### JSX
|
||||||
|
>
|
||||||
|
> ```jsx
|
||||||
|
> <GitHubCode url="https://github.com/..." lang="python" />
|
||||||
|
> ```
|
||||||
|
|
||||||
|
```python
|
||||||
|
https://github.com/explosion/spaCy/tree/master/spacy/language.py
|
||||||
|
```
|
||||||
|
|
||||||
|
### Infobox {#infobox}
|
||||||
|
|
||||||
|
import Infobox from 'components/infobox'
|
||||||
|
|
||||||
|
> #### JSX
|
||||||
|
>
|
||||||
|
> ```jsx
|
||||||
|
> <Infobox title="Information">Regular infobox</Infobox>
|
||||||
|
> <Infobox title="Important note" variant="warning">This is a warning.</Infobox>
|
||||||
|
> <Infobox title="Be careful!" variant="danger">This is dangerous.</Infobox>
|
||||||
|
> ```
|
||||||
|
|
||||||
|
Infoboxes can be used to add notes, updates, warnings or additional information
|
||||||
|
to a page or section. Semantically, they're implemented and interpreted as an
|
||||||
|
`aside` element. Infoboxes can take an optional `title` argument, as well as an
|
||||||
|
optional `variant` (either `"warning"` or `"danger"`).
|
||||||
|
|
||||||
|
<Infobox title="This is an infobox">
|
||||||
|
|
||||||
|
If needed, an infobox can contain regular text, `inline code`, lists and other
|
||||||
|
blocks.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
|
<Infobox title="This is a warning" variant="warning">
|
||||||
|
|
||||||
|
If needed, an infobox can contain regular text, `inline code`, lists and other
|
||||||
|
blocks.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
|
<Infobox title="This is dangerous" variant="danger">
|
||||||
|
|
||||||
|
If needed, an infobox can contain regular text, `inline code`, lists and other
|
||||||
|
blocks.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
|
### Accordion {#accordion}
|
||||||
|
|
||||||
|
import Accordion from 'components/accordion'
|
||||||
|
|
||||||
|
> #### JSX
|
||||||
|
>
|
||||||
|
> ```jsx
|
||||||
|
> <Accordion title="This is an accordion">
|
||||||
|
> Accordion content goes here.
|
||||||
|
> </Accordion>
|
||||||
|
> ```
|
||||||
|
|
||||||
|
Accordions are collapsible sections that are mostly used for lengthy tables,
|
||||||
|
like the tag and label annotation schemes for different languages. They all need
|
||||||
|
to be presented – but chances are the user doesn't actually care about _all_ of
|
||||||
|
them, especially not at the same time. So it's fairly reasonable to hide them
|
||||||
|
behind a click. This particular implementation was inspired by the amazing
|
||||||
|
[Inclusive Components blog](https://inclusive-components.design/collapsible-sections/).
|
||||||
|
|
||||||
|
<Accordion title="This is an accordion">
|
||||||
|
|
||||||
|
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Quisque enim ante,
|
||||||
|
pretium a orci eget, varius dignissim augue. Nam eu dictum mauris, id tincidunt
|
||||||
|
nisi. Integer commodo pellentesque tincidunt. Nam at turpis finibus tortor
|
||||||
|
gravida sodales tincidunt sit amet est. Nullam euismod arcu in tortor auctor,
|
||||||
|
sit amet dignissim justo congue.
|
||||||
|
|
||||||
|
</Accordion>
|
||||||
|
|
||||||
|
## Markdown reference {#markdown}
|
||||||
|
|
||||||
|
All page content and page meta lives in the `.md` files in the `/docs`
|
||||||
|
directory. The frontmatter block at the top of each file defines the page title
|
||||||
|
and other settings like the sidebar menu.
|
||||||
|
|
||||||
|
````markdown
|
||||||
|
---
|
||||||
|
title: Page title
|
||||||
|
---
|
||||||
|
|
||||||
|
## Headline starting a section {#some_id}
|
||||||
|
|
||||||
|
This is a regular paragraph with a [link](https://spacy.io) and **bold text**.
|
||||||
|
|
||||||
|
> #### This is an aside title
|
||||||
|
>
|
||||||
|
> This is aside text.
|
||||||
|
|
||||||
|
### Subheadline
|
||||||
|
|
||||||
|
| Header 1 | Header 2 |
|
||||||
|
| -------- | -------- |
|
||||||
|
| Column 1 | Column 2 |
|
||||||
|
|
||||||
|
```python
|
||||||
|
### Code block title {highlight="2-3"}
|
||||||
|
import spacy
|
||||||
|
nlp = spacy.load("en_core_web_sm")
|
||||||
|
doc = nlp("Hello world")
|
||||||
|
```
|
||||||
|
|
||||||
|
<Infobox title="Important note" variant="warning">
|
||||||
|
|
||||||
|
This is content in the infobox.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
````
|
||||||
|
|
||||||
|
In addition to the native markdown elements, you can use the components
|
||||||
|
[`<Infobox />`][infobox], [`<Accordion />`][accordion], [`<Abbr />`][abbr] and
|
||||||
|
[`<Tag />`][tag] via their JSX syntax.
|
||||||
|
|
||||||
|
[infobox]: https://spacy.io/styleguide#infobox
|
||||||
|
[accordion]: https://spacy.io/styleguide#accordion
|
||||||
|
[abbr]: https://spacy.io/styleguide#abbr
|
||||||
|
[tag]: https://spacy.io/styleguide#tags
|
||||||
|
|
||||||
|
## Editorial {#editorial}
|
||||||
|
|
||||||
|
- "spaCy" should always be spelled with a lowercase "s" and a capital "C",
|
||||||
|
unless it specifically refers to the Python package or Python import `spacy`
|
||||||
|
(in which case it should be formatted as code).
|
||||||
|
- ✅ spaCy is a library for advanced NLP in Python.
|
||||||
|
- ❌ Spacy is a library for advanced NLP in Python.
|
||||||
|
- ✅ First, you need to install the `spacy` package from pip.
|
||||||
|
- Mentions of code, like function names, classes, variable names etc. in inline
|
||||||
|
text should be formatted as `code`.
|
||||||
|
- ✅ "Calling the `nlp` object on a text returns a `Doc`."
|
||||||
|
- Objects that have pages in the [API docs](/api) should be linked – for
|
||||||
|
example, [`Doc`](/api/doc) or [`Language.to_disk`](/api/language#to_disk). The
|
||||||
|
mentions should still be formatted as code within the link. Links pointing to
|
||||||
|
the API docs will automatically receive a little icon. However, if a paragraph
|
||||||
|
includes many references to the API, the links can easily get messy. In that
|
||||||
|
case, we typically only link the first mention of an object and not any
|
||||||
|
subsequent ones.
|
||||||
|
- ✅ The [`Span`](/api/span) and [`Token`](/api/token) objects are views of a
|
||||||
|
[`Doc`](/api/doc). [`Span.as_doc`](/api/span#as_doc) creates a `Doc` object
|
||||||
|
from a `Span`.
|
||||||
|
- ❌ The [`Span`](/api/span) and [`Token`](/api/token) objects are views of a
|
||||||
|
[`Doc`](/api/doc). [`Span.as_doc`](/api/span#as_doc) creates a
|
||||||
|
[`Doc`](/api/doc) object from a [`Span`](/api/span).
|
||||||
|
|
||||||
|
* Other things we format as code are: references to trained pipeline packages
|
||||||
|
like `en_core_web_sm` or file names like `code.py` or `meta.json`.
|
||||||
|
|
||||||
|
- ✅ After training, the `config.cfg` is saved to disk.
|
||||||
|
|
||||||
|
* [Type annotations](#type-annotations) are a special type of code formatting,
|
||||||
|
expressed by wrapping the text in `~~` instead of backticks. The result looks
|
||||||
|
like this: ~~List[Doc]~~. All references to known types will be linked
|
||||||
|
automatically.
|
||||||
|
|
||||||
|
- ✅ The model has the input type ~~List[Doc]~~ and it outputs a
|
||||||
|
~~List[Array2d]~~.
|
||||||
|
|
||||||
|
* We try to keep links meaningful but short.
|
||||||
|
- ✅ For details, see the usage guide on
|
||||||
|
[training with custom code](/usage/training#custom-code).
|
||||||
|
- ❌ For details, see
|
||||||
|
[the usage guide on training with custom code](/usage/training#custom-code).
|
||||||
|
- ❌ For details, see the usage guide on training with custom code
|
||||||
|
[here](/usage/training#custom-code).
|
||||||
|
|
|
@ -259,9 +259,9 @@ pipelines.
|
||||||
> This can be used in a project command like so:
|
> This can be used in a project command like so:
|
||||||
>
|
>
|
||||||
> ```yaml
|
> ```yaml
|
||||||
> - name: "echo-path"
|
> - name: 'echo-path'
|
||||||
> script:
|
> script:
|
||||||
> - "echo ${env.ENV_PATH}"
|
> - 'echo ${env.ENV_PATH}'
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Section | Description |
|
| Section | Description |
|
||||||
|
@ -643,12 +643,13 @@ locally.
|
||||||
|
|
||||||
You can list one or more remotes in the `remotes` section of your
|
You can list one or more remotes in the `remotes` section of your
|
||||||
[`project.yml`](#project-yml) by mapping a string name to the URL of the
|
[`project.yml`](#project-yml) by mapping a string name to the URL of the
|
||||||
storage. Under the hood, spaCy uses the
|
storage. Under the hood, spaCy uses
|
||||||
[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to
|
[`Pathy`](https://github.com/justindujardin/pathy) to communicate with the
|
||||||
communicate with the remote storages, so you can use any protocol that
|
remote storages, so you can use any protocol that `Pathy` supports, including
|
||||||
`smart-open` supports, including [S3](https://aws.amazon.com/s3/),
|
[S3](https://aws.amazon.com/s3/),
|
||||||
[Google Cloud Storage](https://cloud.google.com/storage), SSH and more, although
|
[Google Cloud Storage](https://cloud.google.com/storage), and the local
|
||||||
you may need to install extra dependencies to use certain protocols.
|
filesystem, although you may need to install extra dependencies to use certain
|
||||||
|
protocols.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
|
@ -661,7 +662,6 @@ you may need to install extra dependencies to use certain protocols.
|
||||||
remotes:
|
remotes:
|
||||||
default: 's3://my-spacy-bucket'
|
default: 's3://my-spacy-bucket'
|
||||||
local: '/mnt/scratch/cache'
|
local: '/mnt/scratch/cache'
|
||||||
stuff: 'ssh://myserver.example.com/whatever'
|
|
||||||
```
|
```
|
||||||
|
|
||||||
<Infobox title="How it works" emoji="💡">
|
<Infobox title="How it works" emoji="💡">
|
||||||
|
|
|
@ -66,8 +66,8 @@ The English CNN pipelines have new word vectors:
|
||||||
| Package | Model Version | TAG | Parser LAS | NER F |
|
| Package | Model Version | TAG | Parser LAS | NER F |
|
||||||
| ----------------------------------------------- | ------------- | ---: | ---------: | ----: |
|
| ----------------------------------------------- | ------------- | ---: | ---------: | ----: |
|
||||||
| [`en_core_web_md`](/models/en#en_core_web_md) | v3.3.0 | 97.3 | 90.1 | 84.6 |
|
| [`en_core_web_md`](/models/en#en_core_web_md) | v3.3.0 | 97.3 | 90.1 | 84.6 |
|
||||||
| [`en_core_web_md`](/models/en#en_core_web_lg) | v3.4.0 | 97.2 | 90.3 | 85.5 |
|
| [`en_core_web_md`](/models/en#en_core_web_md) | v3.4.0 | 97.2 | 90.3 | 85.5 |
|
||||||
| [`en_core_web_lg`](/models/en#en_core_web_md) | v3.3.0 | 97.4 | 90.1 | 85.3 |
|
| [`en_core_web_lg`](/models/en#en_core_web_lg) | v3.3.0 | 97.4 | 90.1 | 85.3 |
|
||||||
| [`en_core_web_lg`](/models/en#en_core_web_lg) | v3.4.0 | 97.3 | 90.2 | 85.6 |
|
| [`en_core_web_lg`](/models/en#en_core_web_lg) | v3.4.0 | 97.3 | 90.2 | 85.6 |
|
||||||
|
|
||||||
## Notes about upgrading from v3.3 {#upgrading}
|
## Notes about upgrading from v3.3 {#upgrading}
|
||||||
|
|
|
@ -45,7 +45,7 @@
|
||||||
{ "text": "v2.x Documentation", "url": "https://v2.spacy.io" },
|
{ "text": "v2.x Documentation", "url": "https://v2.spacy.io" },
|
||||||
{
|
{
|
||||||
"text": "Custom Solutions",
|
"text": "Custom Solutions",
|
||||||
"url": "https://explosion.ai/spacy-tailored-pipelines"
|
"url": "https://explosion.ai/custom-solutions"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
|
@ -51,7 +51,7 @@
|
||||||
{ "text": "Online Course", "url": "https://course.spacy.io" },
|
{ "text": "Online Course", "url": "https://course.spacy.io" },
|
||||||
{
|
{
|
||||||
"text": "Custom Solutions",
|
"text": "Custom Solutions",
|
||||||
"url": "https://explosion.ai/spacy-tailored-pipelines"
|
"url": "https://explosion.ai/custom-solutions"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
|
|
@ -1023,25 +1023,6 @@
|
||||||
},
|
},
|
||||||
"category": ["pipeline"]
|
"category": ["pipeline"]
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"id": "spacy-sentence-segmenter",
|
|
||||||
"title": "Sentence Segmenter",
|
|
||||||
"slogan": "Custom sentence segmentation for spaCy",
|
|
||||||
"code_example": [
|
|
||||||
"from seg.newline.segmenter import NewLineSegmenter",
|
|
||||||
"import spacy",
|
|
||||||
"",
|
|
||||||
"nlseg = NewLineSegmenter()",
|
|
||||||
"nlp = spacy.load('en')",
|
|
||||||
"nlp.add_pipe(nlseg.set_sent_starts, name='sentence_segmenter', before='parser')",
|
|
||||||
"doc = nlp(my_doc_text)"
|
|
||||||
],
|
|
||||||
"author": "tc64",
|
|
||||||
"author_links": {
|
|
||||||
"github": "tc64"
|
|
||||||
},
|
|
||||||
"category": ["pipeline"]
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"id": "spacy_cld",
|
"id": "spacy_cld",
|
||||||
"title": "spaCy-CLD",
|
"title": "spaCy-CLD",
|
||||||
|
@ -1468,13 +1449,26 @@
|
||||||
"image": "https://jasonkessler.github.io/2012conventions0.0.2.2.png",
|
"image": "https://jasonkessler.github.io/2012conventions0.0.2.2.png",
|
||||||
"code_example": [
|
"code_example": [
|
||||||
"import spacy",
|
"import spacy",
|
||||||
"import scattertext as st",
|
|
||||||
"",
|
"",
|
||||||
"nlp = spacy.load('en')",
|
"from scattertext import SampleCorpora, produce_scattertext_explorer",
|
||||||
"corpus = st.CorpusFromPandas(convention_df,",
|
"from scattertext import produce_scattertext_html",
|
||||||
" category_col='party',",
|
"from scattertext.CorpusFromPandas import CorpusFromPandas",
|
||||||
" text_col='text',",
|
"",
|
||||||
" nlp=nlp).build()"
|
"nlp = spacy.load('en_core_web_sm')",
|
||||||
|
"convention_df = SampleCorpora.ConventionData2012.get_data()",
|
||||||
|
"corpus = CorpusFromPandas(convention_df,",
|
||||||
|
" category_col='party',",
|
||||||
|
" text_col='text',",
|
||||||
|
" nlp=nlp).build()",
|
||||||
|
"",
|
||||||
|
"html = produce_scattertext_html(corpus,",
|
||||||
|
" category='democrat',",
|
||||||
|
" category_name='Democratic',",
|
||||||
|
" not_category_name='Republican',",
|
||||||
|
" minimum_term_frequency=5,",
|
||||||
|
" width_in_pixels=1000)",
|
||||||
|
"open('./simple.html', 'wb').write(html.encode('utf-8'))",
|
||||||
|
"print('Open ./simple.html in Chrome or Firefox.')"
|
||||||
],
|
],
|
||||||
"author": "Jason Kessler",
|
"author": "Jason Kessler",
|
||||||
"author_links": {
|
"author_links": {
|
||||||
|
|
|
@ -105,13 +105,13 @@ const Landing = ({ data }) => {
|
||||||
|
|
||||||
<LandingBannerGrid>
|
<LandingBannerGrid>
|
||||||
<LandingBanner
|
<LandingBanner
|
||||||
to="https://explosion.ai/spacy-tailored-pipelines"
|
to="https://explosion.ai/custom-solutions"
|
||||||
button="Learn more"
|
button="Learn more"
|
||||||
background="#E4F4F9"
|
background="#E4F4F9"
|
||||||
color="#1e1935"
|
color="#1e1935"
|
||||||
small
|
small
|
||||||
>
|
>
|
||||||
<Link to="https://explosion.ai/spacy-tailored-pipelines" hidden>
|
<Link to="https://explosion.ai/custom-solutions" hidden>
|
||||||
<img src={tailoredPipelinesImage} alt="spaCy Tailored Pipelines" />
|
<img src={tailoredPipelinesImage} alt="spaCy Tailored Pipelines" />
|
||||||
</Link>
|
</Link>
|
||||||
<strong>
|
<strong>
|
||||||
|
|
Loading…
Reference in New Issue
Block a user