Merge branch 'master' into docs/llm

2025-08-05 04:40:20 +03:00 · 2023-08-31 11:54:30 +02:00 · 2023-08-31 11:54:30 +02:00 · 40ae30dc5a
commit 40ae30dc5a
parent 8379057c4c 3e4264899c
10 changed files with 53 additions and 19 deletions
--- a/spacy/about.py
+++ b/spacy/about.py
@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.6.0"
+__version__ = "3.6.1"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
--- a/spacy/lang/tr/examples.py
+++ b/spacy/lang/tr/examples.py
@ -15,4 +15,7 @@ sentences = [
    "Türkiye'nin başkenti neresi?",
    "Bakanlar Kurulu 180 günlük eylem planını açıkladı.",
    "Merkez Bankası, beklentiler doğrultusunda faizlerde değişikliğe gitmedi.",
+    "Cemal Süreya kimdir?",
+    "Bunları Biliyor muydunuz?",
+    "Altınoluk Türkiye haritasının neresinde yer alır?",
 ]
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@ -67,8 +67,8 @@ def build_hash_embed_cnn_tok2vec(
        are between 2 and 8.
    window_size (int): The number of tokens on either side to concatenate during
        the convolutions. The receptive field of the CNN will be
-        depth * (window_size * 2 + 1), so a 4-layer network with window_size of
-        2 will be sensitive to 20 words at a time. Recommended value is 1.
+        depth * window_size * 2 + 1, so a 4-layer network with window_size of
+        2 will be sensitive to 17 words at a time. Recommended value is 1.
    embed_size (int): The number of rows in the hash embedding tables. This can
        be surprisingly small, due to the use of the hash embeddings. Recommended
        values are between 2000 and 10000.
--- a/website/docs/api/architectures.mdx
+++ b/website/docs/api/architectures.mdx
@ -83,7 +83,7 @@ consisting of a CNN and a layer-normalized maxout activation function.
 | `width`              | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~                                                                                                          |
 | `depth`              | The number of convolutional layers to use. Recommended values are between `2` and `8`. ~~int~~                                                                                                                                                                                |
 | `embed_size`         | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~                                                                                            |
-| `window_size`        | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * (window_size * 2 + 1)`, so a 4-layer network with a window size of `2` will be sensitive to 20 words at a time. Recommended value is `1`. ~~int~~ |
+| `window_size`        | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * window_size * 2 + 1`, so a 4-layer network with a window size of `2` will be sensitive to 17 words at a time. Recommended value is `1`. ~~int~~ |
 | `maxout_pieces`      | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. ~~int~~                                                                                   |
 | `subword_features`   | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. ~~bool~~                                       |
 | `pretrained_vectors` | Whether to also use static vectors. ~~bool~~                                                                                                                                                                                                                                  |
--- a/website/docs/api/large-language-models.mdx
+++ b/website/docs/api/large-language-models.mdx
@ -893,7 +893,7 @@ OpenAI's `davinci` model family.
 >
 > ```ini
 > [components.llm.model]
-> @llm_models = "spacy.Davinci.v1 "
+> @llm_models = "spacy.Davinci.v1"
 > name = "davinci"
 > config = {"temperature": 0.3}
 > ```
@ -914,7 +914,7 @@ OpenAI's `curie` model family.
 >
 > ```ini
 > [components.llm.model]
-> @llm_models = "spacy.Curie.v1 "
+> @llm_models = "spacy.Curie.v1"
 > name = "curie"
 > config = {"temperature": 0.3}
 > ```
@ -935,7 +935,7 @@ OpenAI's `babbage` model family.
 >
 > ```ini
 > [components.llm.model]
-> @llm_models = "spacy.Babbage.v1 "
+> @llm_models = "spacy.Babbage.v1"
 > name = "babbage"
 > config = {"temperature": 0.3}
 > ```
@ -956,7 +956,7 @@ OpenAI's `ada` model family.
 >
 > ```ini
 > [components.llm.model]
-> @llm_models = "spacy.Ada.v1 "
+> @llm_models = "spacy.Ada.v1"
 > name = "ada"
 > config = {"temperature": 0.3}
 > ```
@ -977,7 +977,7 @@ Cohere's `command` model family.
 >
 > ```ini
 > [components.llm.model]
-> @llm_models = "spacy.Command.v1 "
+> @llm_models = "spacy.Command.v1"
 > name = "command"
 > config = {"temperature": 0.3}
 > ```
@ -998,7 +998,7 @@ Anthropic's `claude-2` model family.
 >
 > ```ini
 > [components.llm.model]
-> @llm_models = "spacy.Claude-2.v1 "
+> @llm_models = "spacy.Claude-2.v1"
 > name = "claude-2"
 > config = {"temperature": 0.3}
 > ```
@ -1019,7 +1019,7 @@ Anthropic's `claude-1` model family.
 >
 > ```ini
 > [components.llm.model]
-> @llm_models = "spacy.Claude-1.v1 "
+> @llm_models = "spacy.Claude-1.v1"
 > name = "claude-1"
 > config = {"temperature": 0.3}
 > ```
@ -1040,7 +1040,7 @@ Anthropic's `claude-instant-1` model family.
 >
 > ```ini
 > [components.llm.model]
-> @llm_models = "spacy.Claude-instant-1.v1 "
+> @llm_models = "spacy.Claude-instant-1.v1"
 > name = "claude-instant-1"
 > config = {"temperature": 0.3}
 > ```
@ -1061,7 +1061,7 @@ Anthropic's `claude-instant-1.1` model family.
 >
 > ```ini
 > [components.llm.model]
-> @llm_models = "spacy.Claude-instant-1-1.v1 "
+> @llm_models = "spacy.Claude-instant-1-1.v1"
 > name = "claude-instant-1.1"
 > config = {"temperature": 0.3}
 > ```
@ -1082,7 +1082,7 @@ Anthropic's `claude-1.0` model family.
 >
 > ```ini
 > [components.llm.model]
-> @llm_models = "spacy.Claude-1-0.v1 "
+> @llm_models = "spacy.Claude-1-0.v1"
 > name = "claude-1.0"
 > config = {"temperature": 0.3}
 > ```
@ -1124,7 +1124,7 @@ Anthropic's `claude-1.3` model family.
 >
 > ```ini
 > [components.llm.model]
-> @llm_models = "spacy.Claude-1-3.v1 "
+> @llm_models = "spacy.Claude-1-3.v1"
 > name = "claude-1.3"
 > config = {"temperature": 0.3}
 > ```
--- a/website/docs/api/spancategorizer.mdx
+++ b/website/docs/api/spancategorizer.mdx
@ -521,7 +521,7 @@ has two columns, indicating the start and end position.
 | Name        | Description                                                                  |
 | ----------- | ---------------------------------------------------------------------------- |
 | `min_size`  | The minimal phrase lengths to suggest (inclusive). ~~[int]~~                 |
-| `max_size`  | The maximal phrase lengths to suggest (exclusive). ~~[int]~~                 |
+| `max_size`  | The maximal phrase lengths to suggest (inclusive). ~~[int]~~                 |
 | **CREATES** | The suggester function. ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ |

 ### spacy.preset_spans_suggester.v1 {id="preset_spans_suggester"}
--- a/website/docs/api/top-level.mdx
+++ b/website/docs/api/top-level.mdx
@ -68,7 +68,7 @@ weights, and returns it.
 cls = spacy.util.get_lang_class(lang)  # 1. Get Language class, e.g. English
 nlp = cls()                            # 2. Initialize it
 for name in pipeline:
-    nlp.add_pipe(name)                 # 3. Add the component to the pipeline
+    nlp.add_pipe(name, config={...})   # 3. Add the component to the pipeline
 nlp.from_disk(data_path)               # 4. Load in the binary data
 ```

--- a/website/docs/usage/processing-pipelines.mdx
+++ b/website/docs/usage/processing-pipelines.mdx
@ -244,7 +244,7 @@ tagging pipeline. This is also why the pipeline state is always held by the
 together and returns an instance of `Language` with a pipeline set and access to
 the binary data:

-```python {title="spacy.load under the hood"}
+```python {title="spacy.load under the hood (abstract example)"}
 lang = "en"
 pipeline = ["tok2vec", "tagger", "parser", "ner", "attribute_ruler", "lemmatizer"]
 data_path = "path/to/en_core_web_sm/en_core_web_sm-3.0.0"
@ -252,7 +252,7 @@ data_path = "path/to/en_core_web_sm/en_core_web_sm-3.0.0"
 cls = spacy.util.get_lang_class(lang)  # 1. Get Language class, e.g. English
 nlp = cls()                            # 2. Initialize it
 for name in pipeline:
-    nlp.add_pipe(name)                 # 3. Add the component to the pipeline
+    nlp.add_pipe(name, config={...})   # 3. Add the component to the pipeline
 nlp.from_disk(data_path)               # 4. Load in the binary data
 ```

--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@ -4444,6 +4444,31 @@
            },
            "category": ["pipeline", "standalone", "scientific"],
            "tags": ["ner"]
+        },
+        {
+            "id": "hobbit-spacy",
+            "title": "Hobbit spaCy",
+            "slogan": "NLP for Middle Earth",
+            "description": "Hobbit spaCy is a custom spaCy pipeline designed specifically for working with Middle Earth and texts from the world of J.R.R. Tolkien.",
+            "github": "wjbmattingly/hobbit-spacy",
+            "pip": "en-hobbit",
+            "code_example": [
+                "import spacy",
+                "",
+                "nlp = spacy.load('en_hobbit')",
+                "doc = nlp('Frodo saw Glorfindel and Glóin; and in a corner alone Strider was sitting, clad in his old travel-worn clothes again')"
+            ],
+            "code_language": "python",
+            "thumb": "https://github.com/wjbmattingly/hobbit-spacy/blob/main/images/hobbit-thumbnail.png?raw=true",
+            "image": "https://github.com/wjbmattingly/hobbit-spacy/raw/main/images/hobbitspacy.png",
+            "author": "W.J.B. Mattingly",
+            "author_links": {
+                "twitter": "wjb_mattingly",
+                "github": "wjbmattingly",
+                "website": "https://wjbmattingly.com"
+            },
+            "category": ["pipeline", "standalone"],
+            "tags": ["spans", "rules", "ner"]
        }
    ],

--- a/website/netlify.toml
+++ b/website/netlify.toml
@ -16,3 +16,9 @@ NETLIFY_NEXT_PLUGIN_SKIP = "true"

 [[plugins]]
 package = "@netlify/plugin-nextjs"
+
+[[headers]]
+  for = "/*"
+  [headers.values]
+    X-Frame-Options = "DENY"
+    X-XSS-Protection = "1; mode=block"