Merge branch 'master' into docs/llm

svlandeg 2023-08-31 11:54:30 +02:00
commit 40ae30dc5a
10 changed files with 53 additions and 19 deletions


@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.6.0"
+__version__ = "3.6.1"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"


@@ -15,4 +15,7 @@ sentences = [
     "Türkiye'nin başkenti neresi?",
     "Bakanlar Kurulu 180 günlük eylem planını açıkladı.",
     "Merkez Bankası, beklentiler doğrultusunda faizlerde değişikliğe gitmedi.",
+    "Cemal Sureya kimdir?",
+    "Bunlari Biliyor muydunuz?",
+    "Altinoluk Turkiye haritasinin neresinde yer alir?",
 ]
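
Not part of the diff: a minimal sketch of how these example sentences are typically exercised, assuming spaCy is installed. Since the new sentences are appended at the end of the list, `sentences[-3:]` selects exactly the added ones.

```python
import spacy
from spacy.lang.tr.examples import sentences

# Tokenize the three newly added Turkish example sentences with a blank
# Turkish pipeline (no trained components required).
nlp = spacy.blank("tr")
for doc in nlp.pipe(sentences[-3:]):
    print([token.text for token in doc])
```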


@ -67,8 +67,8 @@ def build_hash_embed_cnn_tok2vec(
are between 2 and 8. are between 2 and 8.
window_size (int): The number of tokens on either side to concatenate during window_size (int): The number of tokens on either side to concatenate during
the convolutions. The receptive field of the CNN will be the convolutions. The receptive field of the CNN will be
depth * (window_size * 2 + 1), so a 4-layer network with window_size of depth * window_size * 2 + 1, so a 4-layer network with window_size of
2 will be sensitive to 20 words at a time. Recommended value is 1. 2 will be sensitive to 17 words at a time. Recommended value is 1.
embed_size (int): The number of rows in the hash embedding tables. This can embed_size (int): The number of rows in the hash embedding tables. This can
be surprisingly small, due to the use of the hash embeddings. Recommended be surprisingly small, due to the use of the hash embeddings. Recommended
values are between 2000 and 10000. values are between 2000 and 10000.
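
As a side note (not part of the diff), the corrected docstring formula can be sanity-checked in a few lines of Python; `receptive_field` below is a hypothetical helper, not a spaCy function.

```python
def receptive_field(depth: int, window_size: int) -> int:
    # Each of the `depth` convolutional layers widens the context by
    # `window_size` tokens on both sides of the current token.
    return depth * window_size * 2 + 1

# A 4-layer network with window_size=2 sees 17 tokens at a time,
# not 20 as the old formula depth * (window_size * 2 + 1) implied.
assert receptive_field(depth=4, window_size=2) == 17
```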


@@ -83,7 +83,7 @@ consisting of a CNN and a layer-normalized maxout activation function.
 | `width` | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~ |
 | `depth` | The number of convolutional layers to use. Recommended values are between `2` and `8`. ~~int~~ |
 | `embed_size` | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~ |
-| `window_size` | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * (window_size * 2 + 1)`, so a 4-layer network with a window size of `2` will be sensitive to 20 words at a time. Recommended value is `1`. ~~int~~ |
+| `window_size` | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * window_size * 2 + 1`, so a 4-layer network with a window size of `2` will be sensitive to 17 words at a time. Recommended value is `1`. ~~int~~ |
 | `maxout_pieces` | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. ~~int~~ |
 | `subword_features` | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. ~~bool~~ |
 | `pretrained_vectors` | Whether to also use static vectors. ~~bool~~ |


@@ -521,7 +521,7 @@ has two columns, indicating the start and end position.
 | Name        | Description                                                                   |
 | ----------- | ----------------------------------------------------------------------------- |
 | `min_size`  | The minimal phrase lengths to suggest (inclusive). ~~[int]~~                   |
-| `max_size`  | The maximal phrase lengths to suggest (exclusive). ~~[int]~~                   |
+| `max_size`  | The maximal phrase lengths to suggest (inclusive). ~~[int]~~                   |
 | **CREATES** | The suggester function. ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~   |

 ### spacy.preset_spans_suggester.v1 {id="preset_spans_suggester"}
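
A small check of the corrected wording, assuming a recent spaCy (3.3+) where `spacy.ngram_range_suggester.v1` is registered under `misc`: with `min_size=1` and `max_size=2`, a three-token doc should yield five suggested spans (three unigrams plus two bigrams), i.e. `max_size` behaves as inclusive.

```python
import spacy
from spacy import registry

nlp = spacy.blank("en")
doc = nlp("New York City")

# Build the suggester from the registry and run it over a single doc.
make_suggester = registry.misc.get("spacy.ngram_range_suggester.v1")
suggester = make_suggester(min_size=1, max_size=2)
spans = suggester([doc])

print(spans.lengths)  # [5] -> 3 spans of length 1 + 2 spans of length 2
print(spans.dataXd)   # start/end offsets of each suggested span
```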


@@ -68,7 +68,7 @@ weights, and returns it.
 cls = spacy.util.get_lang_class(lang)  # 1. Get Language class, e.g. English
 nlp = cls()  # 2. Initialize it
 for name in pipeline:
-    nlp.add_pipe(name)  # 3. Add the component to the pipeline
+    nlp.add_pipe(name, config={...})  # 3. Add the component to the pipeline
 nlp.from_disk(data_path)  # 4. Load in the binary data
 ```


@@ -244,7 +244,7 @@ tagging pipeline. This is also why the pipeline state is always held by the
 together and returns an instance of `Language` with a pipeline set and access to
 the binary data:

-```python {title="spacy.load under the hood"}
+```python {title="spacy.load under the hood (abstract example)"}
 lang = "en"
 pipeline = ["tok2vec", "tagger", "parser", "ner", "attribute_ruler", "lemmatizer"]
 data_path = "path/to/en_core_web_sm/en_core_web_sm-3.0.0"
@@ -252,7 +252,7 @@ data_path = "path/to/en_core_web_sm/en_core_web_sm-3.0.0"
 cls = spacy.util.get_lang_class(lang)  # 1. Get Language class, e.g. English
 nlp = cls()  # 2. Initialize it
 for name in pipeline:
-    nlp.add_pipe(name)  # 3. Add the component to the pipeline
+    nlp.add_pipe(name, config={...})  # 3. Add the component to the pipeline
 nlp.from_disk(data_path)  # 4. Load in the binary data
 ```


@@ -4444,6 +4444,31 @@
             },
             "category": ["pipeline", "standalone", "scientific"],
             "tags": ["ner"]
+        },
+        {
+            "id": "hobbit-spacy",
+            "title": "Hobbit spaCy",
+            "slogan": "NLP for Middle Earth",
+            "description": "Hobbit spaCy is a custom spaCy pipeline designed specifically for working with Middle Earth and texts from the world of J.R.R. Tolkien.",
+            "github": "wjbmattingly/hobbit-spacy",
+            "pip": "en-hobbit",
+            "code_example": [
+                "import spacy",
+                "",
+                "nlp = spacy.load('en_hobbit')",
+                "doc = nlp('Frodo saw Glorfindel and Glóin; and in a corner alone Strider was sitting, clad in his old travel - worn clothes again')"
+            ],
+            "code_language": "python",
+            "thumb": "https://github.com/wjbmattingly/hobbit-spacy/blob/main/images/hobbit-thumbnail.png?raw=true",
+            "image": "https://github.com/wjbmattingly/hobbit-spacy/raw/main/images/hobbitspacy.png",
+            "author": "W.J.B. Mattingly",
+            "author_links": {
+                "twitter": "wjb_mattingly",
+                "github": "wjbmattingly",
+                "website": "https://wjbmattingly.com"
+            },
+            "category": ["pipeline", "standalone"],
+            "tags": ["spans", "rules", "ner"]
         }
     ],


@@ -16,3 +16,9 @@ NETLIFY_NEXT_PLUGIN_SKIP = "true"

 [[plugins]]
 package = "@netlify/plugin-nextjs"
+
+[[headers]]
+  for = "/*"
+  [headers.values]
+    X-Frame-Options = "DENY"
+    X-XSS-Protection = "1; mode=block"
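
Once deployed, the new response headers can be verified with a quick request; this sketch assumes the third-party `requests` package and that the deployed site in question is spacy.io.

```python
import requests

# Fetch any page and inspect the security headers added via netlify.toml.
resp = requests.get("https://spacy.io")
print(resp.headers.get("X-Frame-Options"))   # expected: DENY
print(resp.headers.get("X-XSS-Protection"))  # expected: 1; mode=block
```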