Merge branch 'master' into feature/3-5

2025-09-21 03:22:37 +03:00 · 2023-01-19 14:04:41 +01:00 · 2023-01-19 14:04:41 +01:00 · d0fd2ea5ea
commit d0fd2ea5ea
parent a3e69504a8 3b8918e166
16 changed files with 84 additions and 83 deletions
--- a/.gitignore
+++ b/.gitignore
@ -10,16 +10,6 @@ spacy/tests/package/setup.cfg
 spacy/tests/package/pyproject.toml
 spacy/tests/package/requirements.txt
 # Website
 website/.cache
 website/public
 website/node_modules
 website/.npm
 website/logs
 *.log
 npm-debug.log*
 quickstart-training-generator.js
 # Cython / C extensions
 cythonize.json
 spacy/*.html
--- a/spacy/kb/kb_in_memory.pyx
+++ b/spacy/kb/kb_in_memory.pyx
@ -25,7 +25,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
    """An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases,
    to support entity linking of named entities to real-world concepts.
-    DOCS: https://spacy.io/api/kb_in_memory
+    DOCS: https://spacy.io/api/inmemorylookupkb
    """
    def __init__(self, Vocab vocab, entity_vector_length):
--- a/spacy/matcher/levenshtein.pyx
+++ b/spacy/matcher/levenshtein.pyx
@ -22,7 +22,7 @@ cpdef bint levenshtein_compare(input_text: str, pattern_text: str, fuzzy: int =
        max_edits = fuzzy
    else:
        # allow at least two edits (to allow at least one transposition) and up
-        # to 20% of the pattern string length
+        # to 30% of the pattern string length
        max_edits = max(2, round(0.3 * len(pattern_text)))
    return levenshtein(input_text, pattern_text, max_edits) <= max_edits
--- a/website/.dockerignore
+++ b/website/.dockerignore
@ -0,0 +1,9 @@
 .cache/
 .next/
 public/
 node_modules
 .npm
 logs
 *.log
 npm-debug.log*
 quickstart-training-generator.js
--- a/website/.gitignore
+++ b/website/.gitignore
@ -1,5 +1,7 @@
 # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
 quickstart-training-generator.js
 # dependencies
 /node_modules
 /.pnp
--- a/website/Dockerfile
+++ b/website/Dockerfile
@ -1,16 +1,14 @@
-FROM node:11.15.0 
+FROM node:18
-WORKDIR /spacy-io
+USER node
 RUN npm install -g gatsby-cli@2.7.4
 COPY package.json .
 COPY package-lock.json . 
 RUN npm install
 # This is so the installed node_modules will be up one directory
 # from where a user mounts files, so that they don't accidentally mount
 # their own node_modules from a different build
 # https://nodejs.org/api/modules.html#modules_loading_from_node_modules_folders
-WORKDIR /spacy-io/website/
+WORKDIR /home/node
 COPY --chown=node package.json .
 COPY --chown=node package-lock.json .
 RUN npm install
 WORKDIR /home/node/website/
--- a/website/README.md
+++ b/website/README.md
@ -41,33 +41,27 @@ If you'd like to do this, **be sure you do _not_ include your local
 `node_modules` folder**, since there are some dependencies that need to be built
 for the image system. Rename it before using.
-```bash
+First build the Docker image. This only needs to be done on the first run
-docker run -it \
+or when changes are made to `Dockerfile` or the website dependencies:
  -v $(pwd):/spacy-io/website \
  -p 8000:8000 \
  ghcr.io/explosion/spacy-io \
  gatsby develop -H 0.0.0.0
 ```
 This will allow you to access the built website at http://0.0.0.0:8000/ in your
 browser, and still edit code in your editor while having the site reflect those
 changes.
 **Note**: If you're working on a Mac with an M1 processor, you might see
 segfault errors from `qemu` if you use the default image. To fix this use the
 `arm64` tagged image in the `docker run` command
 (ghcr.io/explosion/spacy-io:arm64).
 ### Building the Docker image
 If you'd like to build the image locally, you can do so like this:
 ```bash
 docker build -t spacy-io .
 ```
-This will take some time, so if you want to use the prebuilt image you'll save a
+You can then build and run the website with:
-bit of time.
+
 ```bash
 docker run -it \
  --rm \
  -v $(pwd):/home/node/website \
  -p 3000:3000 \
  spacy-io \
  npm run dev -- -H 0.0.0.0
 ```
 This will allow you to access the built website at http://0.0.0.0:3000/ in your
 browser, and still edit code in your editor while having the site reflect those
 changes.
 ## Project structure
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@ -1216,11 +1216,11 @@ $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key]
 ```
 | Name                      | Description                                                                                                                                                                          |
-| ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `model`                   | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                  |
 | `data_path`               | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~                                                 |
 | `output-file`, `-o`       | Output `DocBin` path. ~~str (positional)~~                                                                                                                                           |
-| `--code`, `-c` <Tag variant="new">3</Tag> | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--code`, `-c`            | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
 | `--text-key`, `-tk`       | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~                                                                            |
 | `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~                    |
 | `--gpu-id`, `-g`          | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
--- a/website/docs/api/entitylinker.mdx
+++ b/website/docs/api/entitylinker.mdx
@ -15,7 +15,7 @@ world". It requires a `KnowledgeBase`, as well as a function to generate
 plausible candidates from that `KnowledgeBase` given a certain textual mention,
 and a machine learning model to pick the right candidate, given the local
 context of the mention. `EntityLinker` defaults to using the
-[`InMemoryLookupKB`](/api/kb_in_memory) implementation.
+[`InMemoryLookupKB`](/api/inmemorylookupkb) implementation.
 ## Assigned Attributes {id="assigned-attributes"}
--- a/website/docs/api/inmemorylookupkb.mdx
+++ b/website/docs/api/inmemorylookupkb.mdx
@ -43,7 +43,7 @@ The length of the fixed-size entity vectors in the knowledge base.
 Add an entity to the knowledge base, specifying its corpus frequency and entity
 vector, which should be of length
-[`entity_vector_length`](/api/kb_in_memory#entity_vector_length).
+[`entity_vector_length`](/api/inmemorylookupkb#entity_vector_length).
 > #### Example
 >
@ -79,8 +79,9 @@ frequency and entity vector for each entity.
 Add an alias or mention to the knowledge base, specifying its potential KB
 identifiers and their prior probabilities. The entity identifiers should refer
-to entities previously added with [`add_entity`](/api/kb_in_memory#add_entity)
+to entities previously added with
-or [`set_entities`](/api/kb_in_memory#set_entities). The sum of the prior
+[`add_entity`](/api/inmemorylookupkb#add_entity) or
 [`set_entities`](/api/inmemorylookupkb#set_entities). The sum of the prior
 probabilities should not exceed 1. Note that an empty string cannot be used as
 an alias.
@ -156,7 +157,7 @@ Get a list of all aliases in the knowledge base.
 Given a certain textual mention as input, retrieve a list of candidate entities
 of type [`Candidate`](/api/kb#candidate). Wraps
-[`get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
+[`get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
 > #### Example
 >
@ -174,7 +175,7 @@ of type [`Candidate`](/api/kb#candidate). Wraps
 ## InMemoryLookupKB.get_candidates_batch {id="get_candidates_batch",tag="method"}
-Same as [`get_candidates()`](/api/kb_in_memory#get_candidates), but for an
+Same as [`get_candidates()`](/api/inmemorylookupkb#get_candidates), but for an
 arbitrary number of mentions. The [`EntityLinker`](/api/entitylinker) component
 will call `get_candidates_batch()` instead of `get_candidates()`, if the config
 parameter `candidates_batch_size` is greater than or equal to 1.
@ -231,7 +232,7 @@ Given a certain entity ID, retrieve its pretrained entity vector.
 ## InMemoryLookupKB.get_vectors {id="get_vectors",tag="method"}
-Same as [`get_vector()`](/api/kb_in_memory#get_vector), but for an arbitrary
+Same as [`get_vector()`](/api/inmemorylookupkb#get_vector), but for an arbitrary
 number of entity IDs.
 The default implementation of `get_vectors()` executes `get_vector()` in a loop.
--- a/website/docs/api/kb.mdx
+++ b/website/docs/api/kb.mdx
@ -21,8 +21,8 @@ functions called by the [`EntityLinker`](/api/entitylinker) component.
 <Infobox variant="warning">
 This class was not abstract up to spaCy version 3.5. The `KnowledgeBase`
-implementation up to that point is available as `InMemoryLookupKB` from 3.5
+implementation up to that point is available as
-onwards.
+[`InMemoryLookupKB`](/api/inmemorylookupkb) from 3.5 onwards.
 </Infobox>
@ -110,14 +110,15 @@ to you.
 </Infobox>
 From spaCy 3.5 on, `KnowledgeBase` is an abstract class (with
-[`InMemoryLookupKB`](/api/kb_in_memory) being a drop-in replacement) to allow
+[`InMemoryLookupKB`](/api/inmemorylookupkb) being a drop-in replacement) to
-more flexibility in customizing knowledge bases. Some of its methods were moved
+allow more flexibility in customizing knowledge bases. Some of its methods were
-to [`InMemoryLookupKB`](/api/kb_in_memory) during this refactoring, one of those
+moved to [`InMemoryLookupKB`](/api/inmemorylookupkb) during this refactoring,
-being `get_alias_candidates()`. This method is now available as
+one of those being `get_alias_candidates()`. This method is now available as
-[`InMemoryLookupKB.get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
+[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
-Note: [`InMemoryLookupKB.get_candidates()`](/api/kb_in_memory#get_candidates)
+Note:
 [`InMemoryLookupKB.get_candidates()`](/api/inmemorylookupkb#get_candidates)
 defaults to
-[`InMemoryLookupKB.get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
+[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
 ## KnowledgeBase.get_vector {id="get_vector",tag="method"}
--- a/website/docs/usage/101/_architecture.mdx
+++ b/website/docs/usage/101/_architecture.mdx
@ -79,7 +79,7 @@ operates on a `Doc` and gives you access to the matched tokens **in context**.
 | ------------------------------------------------ | -------------------------------------------------------------------------------------------------- |
 | [`Corpus`](/api/corpus)                          | Class for managing annotated corpora for training and evaluation data.                             |
 | [`KnowledgeBase`](/api/kb)                       | Abstract base class for storage and retrieval of data for entity linking.                          |
-| [`InMemoryLookupKB`](/api/kb_in_memory)          | Implementation of `KnowledgeBase` storing all data in memory.                                      |
+| [`InMemoryLookupKB`](/api/inmemorylookupkb)      | Implementation of `KnowledgeBase` storing all data in memory.                                      |
 | [`Candidate`](/api/kb#candidate)                 | Object associating a textual mention with a specific entity contained in a `KnowledgeBase`.        |
 | [`Lookups`](/api/lookups)                        | Container for convenient access to large lookup tables and dictionaries.                           |
 | [`MorphAnalysis`](/api/morphology#morphanalysis) | A morphological analysis.                                                                          |
--- a/website/docs/usage/rule-based-matching.mdx
+++ b/website/docs/usage/rule-based-matching.mdx
@ -384,14 +384,14 @@ the more specific attributes `FUZZY1`..`FUZZY9` you can specify the maximum
 allowed edit distance directly.
 ```python
-# Match lowercase with fuzzy matching (allows 2 edits)
+# Match lowercase with fuzzy matching (allows 3 edits)
 pattern = [{"LOWER": {"FUZZY": "definitely"}}]
-# Match custom attribute values with fuzzy matching (allows 2 edits)
+# Match custom attribute values with fuzzy matching (allows 3 edits)
 pattern = [{"_": {"country": {"FUZZY": "Kyrgyzstan"}}}]
-# Match with exact Levenshtein edit distance limits (allows 3 edits)
+# Match with exact Levenshtein edit distance limits (allows 4 edits)
-pattern = [{"_": {"country": {"FUZZY3": "Kyrgyzstan"}}}]
+pattern = [{"_": {"country": {"FUZZY4": "Kyrgyzstan"}}}]
 ```
 #### Regex and fuzzy matching with lists {id="regex-fuzzy-lists", version="3.5"}
--- a/website/docs/usage/saving-loading.mdx
+++ b/website/docs/usage/saving-loading.mdx
@ -684,10 +684,15 @@ If your pipeline includes
 [custom components](/usage/processing-pipelines#custom-components), model
 architectures or other [code](/usage/training#custom-code), those functions need
 to be registered **before** your pipeline is loaded. Otherwise, spaCy won't know
-how to create the objects referenced in the config. The
+how to create the objects referenced in the config. If you're loading your own
-[`spacy package`](/api/cli#package) command lets you provide one or more paths
+pipeline in Python, you can make custom components available just by importing
-to Python files containing custom registered functions using the `--code`
+the code that defines them before calling
-argument.
+[`spacy.load`](/api/top-level#spacy.load). This is also how the `--code`
 argument to CLI commands works.
 With the [`spacy package`](/api/cli#package) command, you can provide one or
 more paths to Python files containing custom registered functions using the
 `--code` argument.
 > #### \_\_init\_\_.py (excerpt)
 >
--- a/website/meta/sidebars.json
+++ b/website/meta/sidebars.json
@ -130,6 +130,7 @@
                "items": [
                    { "text": "Attributes", "url": "/api/attributes" },
                    { "text": "Corpus", "url": "/api/corpus" },
                    { "text": "InMemoryLookupKB", "url": "/api/inmemorylookupkb" },
                    { "text": "KnowledgeBase", "url": "/api/kb" },
                    { "text": "Lookups", "url": "/api/lookups" },
                    { "text": "MorphAnalysis", "url": "/api/morphology#morphanalysis" },
--- a/website/pages/index.tsx
+++ b/website/pages/index.tsx
@ -89,7 +89,7 @@ const Landing = () => {
                </LandingCard>
                <LandingCard title="Awesome ecosystem" url="/usage/projects" button="Read more">
-                    In the five years since its release, spaCy has become an industry standard with
+                    Since its release in 2015, spaCy has become an industry standard with
                    a huge ecosystem. Choose from a variety of plugins, integrate with your machine
                    learning stack and build custom components and workflows.
                </LandingCard>