Merge branch 'master' into feature/3-5

This commit is contained in:
Adriane Boyd 2023-01-19 14:04:41 +01:00 committed by GitHub
commit d0fd2ea5ea
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
16 changed files with 84 additions and 83 deletions

10
.gitignore vendored
View File

@ -10,16 +10,6 @@ spacy/tests/package/setup.cfg
spacy/tests/package/pyproject.toml spacy/tests/package/pyproject.toml
spacy/tests/package/requirements.txt spacy/tests/package/requirements.txt
# Website
website/.cache
website/public
website/node_modules
website/.npm
website/logs
*.log
npm-debug.log*
quickstart-training-generator.js
# Cython / C extensions # Cython / C extensions
cythonize.json cythonize.json
spacy/*.html spacy/*.html

View File

@ -25,7 +25,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
"""An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases, """An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases,
to support entity linking of named entities to real-world concepts. to support entity linking of named entities to real-world concepts.
DOCS: https://spacy.io/api/kb_in_memory DOCS: https://spacy.io/api/inmemorylookupkb
""" """
def __init__(self, Vocab vocab, entity_vector_length): def __init__(self, Vocab vocab, entity_vector_length):

View File

@ -22,7 +22,7 @@ cpdef bint levenshtein_compare(input_text: str, pattern_text: str, fuzzy: int =
max_edits = fuzzy max_edits = fuzzy
else: else:
# allow at least two edits (to allow at least one transposition) and up # allow at least two edits (to allow at least one transposition) and up
# to 20% of the pattern string length # to 30% of the pattern string length
max_edits = max(2, round(0.3 * len(pattern_text))) max_edits = max(2, round(0.3 * len(pattern_text)))
return levenshtein(input_text, pattern_text, max_edits) <= max_edits return levenshtein(input_text, pattern_text, max_edits) <= max_edits

9
website/.dockerignore Normal file
View File

@ -0,0 +1,9 @@
.cache/
.next/
public/
node_modules
.npm
logs
*.log
npm-debug.log*
quickstart-training-generator.js

2
website/.gitignore vendored
View File

@ -1,5 +1,7 @@
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files. # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
quickstart-training-generator.js
# dependencies # dependencies
/node_modules /node_modules
/.pnp /.pnp

View File

@ -1,16 +1,14 @@
FROM node:11.15.0 FROM node:18
WORKDIR /spacy-io USER node
RUN npm install -g gatsby-cli@2.7.4
COPY package.json .
COPY package-lock.json .
RUN npm install
# This is so the installed node_modules will be up one directory # This is so the installed node_modules will be up one directory
# from where a user mounts files, so that they don't accidentally mount # from where a user mounts files, so that they don't accidentally mount
# their own node_modules from a different build # their own node_modules from a different build
# https://nodejs.org/api/modules.html#modules_loading_from_node_modules_folders # https://nodejs.org/api/modules.html#modules_loading_from_node_modules_folders
WORKDIR /spacy-io/website/ WORKDIR /home/node
COPY --chown=node package.json .
COPY --chown=node package-lock.json .
RUN npm install
WORKDIR /home/node/website/

View File

@ -41,33 +41,27 @@ If you'd like to do this, **be sure you do _not_ include your local
`node_modules` folder**, since there are some dependencies that need to be built `node_modules` folder**, since there are some dependencies that need to be built
for the image system. Rename it before using. for the image system. Rename it before using.
```bash First build the Docker image. This only needs to be done on the first run
docker run -it \ or when changes are made to `Dockerfile` or the website dependencies:
-v $(pwd):/spacy-io/website \
-p 8000:8000 \
ghcr.io/explosion/spacy-io \
gatsby develop -H 0.0.0.0
```
This will allow you to access the built website at http://0.0.0.0:8000/ in your
browser, and still edit code in your editor while having the site reflect those
changes.
**Note**: If you're working on a Mac with an M1 processor, you might see
segfault errors from `qemu` if you use the default image. To fix this use the
`arm64` tagged image in the `docker run` command
(ghcr.io/explosion/spacy-io:arm64).
### Building the Docker image
If you'd like to build the image locally, you can do so like this:
```bash ```bash
docker build -t spacy-io . docker build -t spacy-io .
``` ```
This will take some time, so if you want to use the prebuilt image you'll save a You can then build and run the website with:
bit of time.
```bash
docker run -it \
--rm \
-v $(pwd):/home/node/website \
-p 3000:3000 \
spacy-io \
npm run dev -- -H 0.0.0.0
```
This will allow you to access the built website at http://0.0.0.0:3000/ in your
browser, and still edit code in your editor while having the site reflect those
changes.
## Project structure ## Project structure

View File

@ -1216,11 +1216,11 @@ $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key]
``` ```
| Name | Description | | Name | Description |
| ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `model` | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~ | | `model` | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~ |
| `data_path` | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~ | | `data_path` | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~ |
| `output-file`, `-o` | Output `DocBin` path. ~~str (positional)~~ | | `output-file`, `-o` | Output `DocBin` path. ~~str (positional)~~ |
| `--code`, `-c` <Tag variant="new">3</Tag> | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | | `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--text-key`, `-tk` | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~ | | `--text-key`, `-tk` | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~ |
| `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~ | | `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~ |
| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | | `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |

View File

@ -15,7 +15,7 @@ world". It requires a `KnowledgeBase`, as well as a function to generate
plausible candidates from that `KnowledgeBase` given a certain textual mention, plausible candidates from that `KnowledgeBase` given a certain textual mention,
and a machine learning model to pick the right candidate, given the local and a machine learning model to pick the right candidate, given the local
context of the mention. `EntityLinker` defaults to using the context of the mention. `EntityLinker` defaults to using the
[`InMemoryLookupKB`](/api/kb_in_memory) implementation. [`InMemoryLookupKB`](/api/inmemorylookupkb) implementation.
## Assigned Attributes {id="assigned-attributes"} ## Assigned Attributes {id="assigned-attributes"}

View File

@ -43,7 +43,7 @@ The length of the fixed-size entity vectors in the knowledge base.
Add an entity to the knowledge base, specifying its corpus frequency and entity Add an entity to the knowledge base, specifying its corpus frequency and entity
vector, which should be of length vector, which should be of length
[`entity_vector_length`](/api/kb_in_memory#entity_vector_length). [`entity_vector_length`](/api/inmemorylookupkb#entity_vector_length).
> #### Example > #### Example
> >
@ -79,8 +79,9 @@ frequency and entity vector for each entity.
Add an alias or mention to the knowledge base, specifying its potential KB Add an alias or mention to the knowledge base, specifying its potential KB
identifiers and their prior probabilities. The entity identifiers should refer identifiers and their prior probabilities. The entity identifiers should refer
to entities previously added with [`add_entity`](/api/kb_in_memory#add_entity) to entities previously added with
or [`set_entities`](/api/kb_in_memory#set_entities). The sum of the prior [`add_entity`](/api/inmemorylookupkb#add_entity) or
[`set_entities`](/api/inmemorylookupkb#set_entities). The sum of the prior
probabilities should not exceed 1. Note that an empty string cannot be used as probabilities should not exceed 1. Note that an empty string cannot be used as
an alias. an alias.
@ -156,7 +157,7 @@ Get a list of all aliases in the knowledge base.
Given a certain textual mention as input, retrieve a list of candidate entities Given a certain textual mention as input, retrieve a list of candidate entities
of type [`Candidate`](/api/kb#candidate). Wraps of type [`Candidate`](/api/kb#candidate). Wraps
[`get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates). [`get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
> #### Example > #### Example
> >
@ -174,7 +175,7 @@ of type [`Candidate`](/api/kb#candidate). Wraps
## InMemoryLookupKB.get_candidates_batch {id="get_candidates_batch",tag="method"} ## InMemoryLookupKB.get_candidates_batch {id="get_candidates_batch",tag="method"}
Same as [`get_candidates()`](/api/kb_in_memory#get_candidates), but for an Same as [`get_candidates()`](/api/inmemorylookupkb#get_candidates), but for an
arbitrary number of mentions. The [`EntityLinker`](/api/entitylinker) component arbitrary number of mentions. The [`EntityLinker`](/api/entitylinker) component
will call `get_candidates_batch()` instead of `get_candidates()`, if the config will call `get_candidates_batch()` instead of `get_candidates()`, if the config
parameter `candidates_batch_size` is greater than or equal to 1. parameter `candidates_batch_size` is greater than or equal to 1.
@ -231,7 +232,7 @@ Given a certain entity ID, retrieve its pretrained entity vector.
## InMemoryLookupKB.get_vectors {id="get_vectors",tag="method"} ## InMemoryLookupKB.get_vectors {id="get_vectors",tag="method"}
Same as [`get_vector()`](/api/kb_in_memory#get_vector), but for an arbitrary Same as [`get_vector()`](/api/inmemorylookupkb#get_vector), but for an arbitrary
number of entity IDs. number of entity IDs.
The default implementation of `get_vectors()` executes `get_vector()` in a loop. The default implementation of `get_vectors()` executes `get_vector()` in a loop.

View File

@ -21,8 +21,8 @@ functions called by the [`EntityLinker`](/api/entitylinker) component.
<Infobox variant="warning"> <Infobox variant="warning">
This class was not abstract up to spaCy version 3.5. The `KnowledgeBase` This class was not abstract up to spaCy version 3.5. The `KnowledgeBase`
implementation up to that point is available as `InMemoryLookupKB` from 3.5 implementation up to that point is available as
onwards. [`InMemoryLookupKB`](/api/inmemorylookupkb) from 3.5 onwards.
</Infobox> </Infobox>
@ -110,14 +110,15 @@ to you.
</Infobox> </Infobox>
From spaCy 3.5 on, `KnowledgeBase` is an abstract class (with From spaCy 3.5 on, `KnowledgeBase` is an abstract class (with
[`InMemoryLookupKB`](/api/kb_in_memory) being a drop-in replacement) to allow [`InMemoryLookupKB`](/api/inmemorylookupkb) being a drop-in replacement) to
more flexibility in customizing knowledge bases. Some of its methods were moved allow more flexibility in customizing knowledge bases. Some of its methods were
to [`InMemoryLookupKB`](/api/kb_in_memory) during this refactoring, one of those moved to [`InMemoryLookupKB`](/api/inmemorylookupkb) during this refactoring,
being `get_alias_candidates()`. This method is now available as one of those being `get_alias_candidates()`. This method is now available as
[`InMemoryLookupKB.get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates). [`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
Note: [`InMemoryLookupKB.get_candidates()`](/api/kb_in_memory#get_candidates) Note:
[`InMemoryLookupKB.get_candidates()`](/api/inmemorylookupkb#get_candidates)
defaults to defaults to
[`InMemoryLookupKB.get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates). [`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
## KnowledgeBase.get_vector {id="get_vector",tag="method"} ## KnowledgeBase.get_vector {id="get_vector",tag="method"}

View File

@ -79,7 +79,7 @@ operates on a `Doc` and gives you access to the matched tokens **in context**.
| ------------------------------------------------ | -------------------------------------------------------------------------------------------------- | | ------------------------------------------------ | -------------------------------------------------------------------------------------------------- |
| [`Corpus`](/api/corpus) | Class for managing annotated corpora for training and evaluation data. | | [`Corpus`](/api/corpus) | Class for managing annotated corpora for training and evaluation data. |
| [`KnowledgeBase`](/api/kb) | Abstract base class for storage and retrieval of data for entity linking. | | [`KnowledgeBase`](/api/kb) | Abstract base class for storage and retrieval of data for entity linking. |
| [`InMemoryLookupKB`](/api/kb_in_memory) | Implementation of `KnowledgeBase` storing all data in memory. | | [`InMemoryLookupKB`](/api/inmemorylookupkb) | Implementation of `KnowledgeBase` storing all data in memory. |
| [`Candidate`](/api/kb#candidate) | Object associating a textual mention with a specific entity contained in a `KnowledgeBase`. | | [`Candidate`](/api/kb#candidate) | Object associating a textual mention with a specific entity contained in a `KnowledgeBase`. |
| [`Lookups`](/api/lookups) | Container for convenient access to large lookup tables and dictionaries. | | [`Lookups`](/api/lookups) | Container for convenient access to large lookup tables and dictionaries. |
| [`MorphAnalysis`](/api/morphology#morphanalysis) | A morphological analysis. | | [`MorphAnalysis`](/api/morphology#morphanalysis) | A morphological analysis. |

View File

@ -384,14 +384,14 @@ the more specific attributes `FUZZY1`..`FUZZY9` you can specify the maximum
allowed edit distance directly. allowed edit distance directly.
```python ```python
# Match lowercase with fuzzy matching (allows 2 edits) # Match lowercase with fuzzy matching (allows 3 edits)
pattern = [{"LOWER": {"FUZZY": "definitely"}}] pattern = [{"LOWER": {"FUZZY": "definitely"}}]
# Match custom attribute values with fuzzy matching (allows 2 edits) # Match custom attribute values with fuzzy matching (allows 3 edits)
pattern = [{"_": {"country": {"FUZZY": "Kyrgyzstan"}}}] pattern = [{"_": {"country": {"FUZZY": "Kyrgyzstan"}}}]
# Match with exact Levenshtein edit distance limits (allows 3 edits) # Match with exact Levenshtein edit distance limits (allows 4 edits)
pattern = [{"_": {"country": {"FUZZY3": "Kyrgyzstan"}}}] pattern = [{"_": {"country": {"FUZZY4": "Kyrgyzstan"}}}]
``` ```
#### Regex and fuzzy matching with lists {id="regex-fuzzy-lists", version="3.5"} #### Regex and fuzzy matching with lists {id="regex-fuzzy-lists", version="3.5"}

View File

@ -684,10 +684,15 @@ If your pipeline includes
[custom components](/usage/processing-pipelines#custom-components), model [custom components](/usage/processing-pipelines#custom-components), model
architectures or other [code](/usage/training#custom-code), those functions need architectures or other [code](/usage/training#custom-code), those functions need
to be registered **before** your pipeline is loaded. Otherwise, spaCy won't know to be registered **before** your pipeline is loaded. Otherwise, spaCy won't know
how to create the objects referenced in the config. The how to create the objects referenced in the config. If you're loading your own
[`spacy package`](/api/cli#package) command lets you provide one or more paths pipeline in Python, you can make custom components available just by importing
to Python files containing custom registered functions using the `--code` the code that defines them before calling
argument. [`spacy.load`](/api/top-level#spacy.load). This is also how the `--code`
argument to CLI commands works.
With the [`spacy package`](/api/cli#package) command, you can provide one or
more paths to Python files containing custom registered functions using the
`--code` argument.
> #### \_\_init\_\_.py (excerpt) > #### \_\_init\_\_.py (excerpt)
> >

View File

@ -130,6 +130,7 @@
"items": [ "items": [
{ "text": "Attributes", "url": "/api/attributes" }, { "text": "Attributes", "url": "/api/attributes" },
{ "text": "Corpus", "url": "/api/corpus" }, { "text": "Corpus", "url": "/api/corpus" },
{ "text": "InMemoryLookupKB", "url": "/api/inmemorylookupkb" },
{ "text": "KnowledgeBase", "url": "/api/kb" }, { "text": "KnowledgeBase", "url": "/api/kb" },
{ "text": "Lookups", "url": "/api/lookups" }, { "text": "Lookups", "url": "/api/lookups" },
{ "text": "MorphAnalysis", "url": "/api/morphology#morphanalysis" }, { "text": "MorphAnalysis", "url": "/api/morphology#morphanalysis" },

View File

@ -89,7 +89,7 @@ const Landing = () => {
</LandingCard> </LandingCard>
<LandingCard title="Awesome ecosystem" url="/usage/projects" button="Read more"> <LandingCard title="Awesome ecosystem" url="/usage/projects" button="Read more">
In the five years since its release, spaCy has become an industry standard with Since its release in 2015, spaCy has become an industry standard with
a huge ecosystem. Choose from a variety of plugins, integrate with your machine a huge ecosystem. Choose from a variety of plugins, integrate with your machine
learning stack and build custom components and workflows. learning stack and build custom components and workflows.
</LandingCard> </LandingCard>