mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-04 12:20:20 +03:00
Merge branch 'master' into feature/3-5
This commit is contained in:
commit
d0fd2ea5ea
10
.gitignore
vendored
10
.gitignore
vendored
|
@ -10,16 +10,6 @@ spacy/tests/package/setup.cfg
|
|||
spacy/tests/package/pyproject.toml
|
||||
spacy/tests/package/requirements.txt
|
||||
|
||||
# Website
|
||||
website/.cache
|
||||
website/public
|
||||
website/node_modules
|
||||
website/.npm
|
||||
website/logs
|
||||
*.log
|
||||
npm-debug.log*
|
||||
quickstart-training-generator.js
|
||||
|
||||
# Cython / C extensions
|
||||
cythonize.json
|
||||
spacy/*.html
|
||||
|
|
|
@ -25,7 +25,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
"""An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases,
|
||||
to support entity linking of named entities to real-world concepts.
|
||||
|
||||
DOCS: https://spacy.io/api/kb_in_memory
|
||||
DOCS: https://spacy.io/api/inmemorylookupkb
|
||||
"""
|
||||
|
||||
def __init__(self, Vocab vocab, entity_vector_length):
|
||||
|
|
|
@ -22,7 +22,7 @@ cpdef bint levenshtein_compare(input_text: str, pattern_text: str, fuzzy: int =
|
|||
max_edits = fuzzy
|
||||
else:
|
||||
# allow at least two edits (to allow at least one transposition) and up
|
||||
# to 20% of the pattern string length
|
||||
# to 30% of the pattern string length
|
||||
max_edits = max(2, round(0.3 * len(pattern_text)))
|
||||
return levenshtein(input_text, pattern_text, max_edits) <= max_edits
|
||||
|
||||
|
|
9
website/.dockerignore
Normal file
9
website/.dockerignore
Normal file
|
@ -0,0 +1,9 @@
|
|||
.cache/
|
||||
.next/
|
||||
public/
|
||||
node_modules
|
||||
.npm
|
||||
logs
|
||||
*.log
|
||||
npm-debug.log*
|
||||
quickstart-training-generator.js
|
2
website/.gitignore
vendored
2
website/.gitignore
vendored
|
@ -1,5 +1,7 @@
|
|||
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
|
||||
|
||||
quickstart-training-generator.js
|
||||
|
||||
# dependencies
|
||||
/node_modules
|
||||
/.pnp
|
||||
|
|
|
@ -1,16 +1,14 @@
|
|||
FROM node:11.15.0
|
||||
FROM node:18
|
||||
|
||||
WORKDIR /spacy-io
|
||||
|
||||
RUN npm install -g gatsby-cli@2.7.4
|
||||
|
||||
COPY package.json .
|
||||
COPY package-lock.json .
|
||||
|
||||
RUN npm install
|
||||
USER node
|
||||
|
||||
# This is so the installed node_modules will be up one directory
|
||||
# from where a user mounts files, so that they don't accidentally mount
|
||||
# their own node_modules from a different build
|
||||
# https://nodejs.org/api/modules.html#modules_loading_from_node_modules_folders
|
||||
WORKDIR /spacy-io/website/
|
||||
WORKDIR /home/node
|
||||
COPY --chown=node package.json .
|
||||
COPY --chown=node package-lock.json .
|
||||
RUN npm install
|
||||
|
||||
WORKDIR /home/node/website/
|
||||
|
|
|
@ -41,33 +41,27 @@ If you'd like to do this, **be sure you do _not_ include your local
|
|||
`node_modules` folder**, since there are some dependencies that need to be built
|
||||
for the image system. Rename it before using.
|
||||
|
||||
```bash
|
||||
docker run -it \
|
||||
-v $(pwd):/spacy-io/website \
|
||||
-p 8000:8000 \
|
||||
ghcr.io/explosion/spacy-io \
|
||||
gatsby develop -H 0.0.0.0
|
||||
```
|
||||
|
||||
This will allow you to access the built website at http://0.0.0.0:8000/ in your
|
||||
browser, and still edit code in your editor while having the site reflect those
|
||||
changes.
|
||||
|
||||
**Note**: If you're working on a Mac with an M1 processor, you might see
|
||||
segfault errors from `qemu` if you use the default image. To fix this use the
|
||||
`arm64` tagged image in the `docker run` command
|
||||
(ghcr.io/explosion/spacy-io:arm64).
|
||||
|
||||
### Building the Docker image
|
||||
|
||||
If you'd like to build the image locally, you can do so like this:
|
||||
First build the Docker image. This only needs to be done on the first run
|
||||
or when changes are made to `Dockerfile` or the website dependencies:
|
||||
|
||||
```bash
|
||||
docker build -t spacy-io .
|
||||
```
|
||||
|
||||
This will take some time, so if you want to use the prebuilt image you'll save a
|
||||
bit of time.
|
||||
You can then build and run the website with:
|
||||
|
||||
```bash
|
||||
docker run -it \
|
||||
--rm \
|
||||
-v $(pwd):/home/node/website \
|
||||
-p 3000:3000 \
|
||||
spacy-io \
|
||||
npm run dev -- -H 0.0.0.0
|
||||
```
|
||||
|
||||
This will allow you to access the built website at http://0.0.0.0:3000/ in your
|
||||
browser, and still edit code in your editor while having the site reflect those
|
||||
changes.
|
||||
|
||||
## Project structure
|
||||
|
||||
|
|
|
@ -1215,19 +1215,19 @@ When a directory is provided it is traversed recursively to collect all files.
|
|||
$ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process]
|
||||
```
|
||||
|
||||
| Name | Description |
|
||||
| ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `model` | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~ |
|
||||
| `data_path` | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~ |
|
||||
| `output-file`, `-o` | Output `DocBin` path. ~~str (positional)~~ |
|
||||
| `--code`, `-c` <Tag variant="new">3</Tag> | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||
| `--text-key`, `-tk` | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~ |
|
||||
| `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~ |
|
||||
| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
|
||||
| `--batch-size`, `-b` | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~ |
|
||||
| `--n-process`, `-n` | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **CREATES** | A `DocBin` with the annotations from the `model` for all the files found in `data-path`. |
|
||||
| Name | Description |
|
||||
| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `model` | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~ |
|
||||
| `data_path` | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~ |
|
||||
| `output-file`, `-o` | Output `DocBin` path. ~~str (positional)~~ |
|
||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||
| `--text-key`, `-tk` | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~ |
|
||||
| `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~ |
|
||||
| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
|
||||
| `--batch-size`, `-b` | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~ |
|
||||
| `--n-process`, `-n` | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **CREATES** | A `DocBin` with the annotations from the `model` for all the files found in `data-path`. |
|
||||
|
||||
## find-threshold {id="find-threshold",version="3.5",tag="command"}
|
||||
|
||||
|
|
|
@ -15,7 +15,7 @@ world". It requires a `KnowledgeBase`, as well as a function to generate
|
|||
plausible candidates from that `KnowledgeBase` given a certain textual mention,
|
||||
and a machine learning model to pick the right candidate, given the local
|
||||
context of the mention. `EntityLinker` defaults to using the
|
||||
[`InMemoryLookupKB`](/api/kb_in_memory) implementation.
|
||||
[`InMemoryLookupKB`](/api/inmemorylookupkb) implementation.
|
||||
|
||||
## Assigned Attributes {id="assigned-attributes"}
|
||||
|
||||
|
|
|
@ -43,7 +43,7 @@ The length of the fixed-size entity vectors in the knowledge base.
|
|||
|
||||
Add an entity to the knowledge base, specifying its corpus frequency and entity
|
||||
vector, which should be of length
|
||||
[`entity_vector_length`](/api/kb_in_memory#entity_vector_length).
|
||||
[`entity_vector_length`](/api/inmemorylookupkb#entity_vector_length).
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -79,8 +79,9 @@ frequency and entity vector for each entity.
|
|||
|
||||
Add an alias or mention to the knowledge base, specifying its potential KB
|
||||
identifiers and their prior probabilities. The entity identifiers should refer
|
||||
to entities previously added with [`add_entity`](/api/kb_in_memory#add_entity)
|
||||
or [`set_entities`](/api/kb_in_memory#set_entities). The sum of the prior
|
||||
to entities previously added with
|
||||
[`add_entity`](/api/inmemorylookupkb#add_entity) or
|
||||
[`set_entities`](/api/inmemorylookupkb#set_entities). The sum of the prior
|
||||
probabilities should not exceed 1. Note that an empty string cannot be used as
|
||||
an alias.
|
||||
|
||||
|
@ -156,7 +157,7 @@ Get a list of all aliases in the knowledge base.
|
|||
|
||||
Given a certain textual mention as input, retrieve a list of candidate entities
|
||||
of type [`Candidate`](/api/kb#candidate). Wraps
|
||||
[`get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
|
||||
[`get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -174,7 +175,7 @@ of type [`Candidate`](/api/kb#candidate). Wraps
|
|||
|
||||
## InMemoryLookupKB.get_candidates_batch {id="get_candidates_batch",tag="method"}
|
||||
|
||||
Same as [`get_candidates()`](/api/kb_in_memory#get_candidates), but for an
|
||||
Same as [`get_candidates()`](/api/inmemorylookupkb#get_candidates), but for an
|
||||
arbitrary number of mentions. The [`EntityLinker`](/api/entitylinker) component
|
||||
will call `get_candidates_batch()` instead of `get_candidates()`, if the config
|
||||
parameter `candidates_batch_size` is greater than or equal to 1.
|
||||
|
@ -231,7 +232,7 @@ Given a certain entity ID, retrieve its pretrained entity vector.
|
|||
|
||||
## InMemoryLookupKB.get_vectors {id="get_vectors",tag="method"}
|
||||
|
||||
Same as [`get_vector()`](/api/kb_in_memory#get_vector), but for an arbitrary
|
||||
Same as [`get_vector()`](/api/inmemorylookupkb#get_vector), but for an arbitrary
|
||||
number of entity IDs.
|
||||
|
||||
The default implementation of `get_vectors()` executes `get_vector()` in a loop.
|
|
@ -21,8 +21,8 @@ functions called by the [`EntityLinker`](/api/entitylinker) component.
|
|||
<Infobox variant="warning">
|
||||
|
||||
This class was not abstract prior to spaCy version 3.5. The `KnowledgeBase`
|
||||
implementation up to that point is available as `InMemoryLookupKB` from 3.5
|
||||
onwards.
|
||||
implementation up to that point is available as
|
||||
[`InMemoryLookupKB`](/api/inmemorylookupkb) from 3.5 onwards.
|
||||
|
||||
</Infobox>
|
||||
|
||||
|
@ -110,14 +110,15 @@ to you.
|
|||
</Infobox>
|
||||
|
||||
From spaCy 3.5 on, `KnowledgeBase` is an abstract class (with
|
||||
[`InMemoryLookupKB`](/api/kb_in_memory) being a drop-in replacement) to allow
|
||||
more flexibility in customizing knowledge bases. Some of its methods were moved
|
||||
to [`InMemoryLookupKB`](/api/kb_in_memory) during this refactoring, one of those
|
||||
being `get_alias_candidates()`. This method is now available as
|
||||
[`InMemoryLookupKB.get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
|
||||
Note: [`InMemoryLookupKB.get_candidates()`](/api/kb_in_memory#get_candidates)
|
||||
[`InMemoryLookupKB`](/api/inmemorylookupkb) being a drop-in replacement) to
|
||||
allow more flexibility in customizing knowledge bases. Some of its methods were
|
||||
moved to [`InMemoryLookupKB`](/api/inmemorylookupkb) during this refactoring,
|
||||
one of those being `get_alias_candidates()`. This method is now available as
|
||||
[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
|
||||
Note:
|
||||
[`InMemoryLookupKB.get_candidates()`](/api/inmemorylookupkb#get_candidates)
|
||||
defaults to
|
||||
[`InMemoryLookupKB.get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
|
||||
[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
|
||||
|
||||
## KnowledgeBase.get_vector {id="get_vector",tag="method"}
|
||||
|
||||
|
|
|
@ -79,7 +79,7 @@ operates on a `Doc` and gives you access to the matched tokens **in context**.
|
|||
| ------------------------------------------------ | -------------------------------------------------------------------------------------------------- |
|
||||
| [`Corpus`](/api/corpus) | Class for managing annotated corpora for training and evaluation data. |
|
||||
| [`KnowledgeBase`](/api/kb) | Abstract base class for storage and retrieval of data for entity linking. |
|
||||
| [`InMemoryLookupKB`](/api/kb_in_memory) | Implementation of `KnowledgeBase` storing all data in memory. |
|
||||
| [`InMemoryLookupKB`](/api/inmemorylookupkb) | Implementation of `KnowledgeBase` storing all data in memory. |
|
||||
| [`Candidate`](/api/kb#candidate) | Object associating a textual mention with a specific entity contained in a `KnowledgeBase`. |
|
||||
| [`Lookups`](/api/lookups) | Container for convenient access to large lookup tables and dictionaries. |
|
||||
| [`MorphAnalysis`](/api/morphology#morphanalysis) | A morphological analysis. |
|
||||
|
|
|
@ -384,14 +384,14 @@ the more specific attributes `FUZZY1`..`FUZZY9` you can specify the maximum
|
|||
allowed edit distance directly.
|
||||
|
||||
```python
|
||||
# Match lowercase with fuzzy matching (allows 2 edits)
|
||||
# Match lowercase with fuzzy matching (allows 3 edits)
|
||||
pattern = [{"LOWER": {"FUZZY": "definitely"}}]
|
||||
|
||||
# Match custom attribute values with fuzzy matching (allows 2 edits)
|
||||
# Match custom attribute values with fuzzy matching (allows 3 edits)
|
||||
pattern = [{"_": {"country": {"FUZZY": "Kyrgyzstan"}}}]
|
||||
|
||||
# Match with exact Levenshtein edit distance limits (allows 3 edits)
|
||||
pattern = [{"_": {"country": {"FUZZY3": "Kyrgyzstan"}}}]
|
||||
# Match with exact Levenshtein edit distance limits (allows 4 edits)
|
||||
pattern = [{"_": {"country": {"FUZZY4": "Kyrgyzstan"}}}]
|
||||
```
|
||||
|
||||
#### Regex and fuzzy matching with lists {id="regex-fuzzy-lists", version="3.5"}
|
||||
|
|
|
@ -684,10 +684,15 @@ If your pipeline includes
|
|||
[custom components](/usage/processing-pipelines#custom-components), model
|
||||
architectures or other [code](/usage/training#custom-code), those functions need
|
||||
to be registered **before** your pipeline is loaded. Otherwise, spaCy won't know
|
||||
how to create the objects referenced in the config. The
|
||||
[`spacy package`](/api/cli#package) command lets you provide one or more paths
|
||||
to Python files containing custom registered functions using the `--code`
|
||||
argument.
|
||||
how to create the objects referenced in the config. If you're loading your own
|
||||
pipeline in Python, you can make custom components available just by importing
|
||||
the code that defines them before calling
|
||||
[`spacy.load`](/api/top-level#spacy.load). This is also how the `--code`
|
||||
argument to CLI commands works.
|
||||
|
||||
With the [`spacy package`](/api/cli#package) command, you can provide one or
|
||||
more paths to Python files containing custom registered functions using the
|
||||
`--code` argument.
|
||||
|
||||
> #### \_\_init\_\_.py (excerpt)
|
||||
>
|
||||
|
|
|
@ -130,6 +130,7 @@
|
|||
"items": [
|
||||
{ "text": "Attributes", "url": "/api/attributes" },
|
||||
{ "text": "Corpus", "url": "/api/corpus" },
|
||||
{ "text": "InMemoryLookupKB", "url": "/api/inmemorylookupkb" },
|
||||
{ "text": "KnowledgeBase", "url": "/api/kb" },
|
||||
{ "text": "Lookups", "url": "/api/lookups" },
|
||||
{ "text": "MorphAnalysis", "url": "/api/morphology#morphanalysis" },
|
||||
|
|
|
@ -89,7 +89,7 @@ const Landing = () => {
|
|||
</LandingCard>
|
||||
|
||||
<LandingCard title="Awesome ecosystem" url="/usage/projects" button="Read more">
|
||||
In the five years since its release, spaCy has become an industry standard with
|
||||
Since its release in 2015, spaCy has become an industry standard with
|
||||
a huge ecosystem. Choose from a variety of plugins, integrate with your machine
|
||||
learning stack and build custom components and workflows.
|
||||
</LandingCard>
|
||||
|
|
Loading…
Reference in New Issue
Block a user