From 458bc5f45c5e371bdbef43d58d078436ee496e43 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 8 Aug 2023 15:04:13 +0200 Subject: [PATCH 1/8] Set version to v3.6.1 (#12892) --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index cad6158da..0f8eee0ff 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.6.0" +__version__ = "3.6.1" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" From d50b8d51e20f4c66ac111e94fdc589e98769c03d Mon Sep 17 00:00:00 2001 From: denizcodeyaa <141595121+denizcodeyaa@users.noreply.github.com> Date: Fri, 11 Aug 2023 09:38:06 -0400 Subject: [PATCH 2/8] Update examples.py (#12895) Add: example sentences to improve the Turkish model. Let's get the tr_web_core_sm out in the world yaa --- spacy/lang/tr/examples.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spacy/lang/tr/examples.py b/spacy/lang/tr/examples.py index dfb324a4e..c912c950d 100644 --- a/spacy/lang/tr/examples.py +++ b/spacy/lang/tr/examples.py @@ -15,4 +15,7 @@ sentences = [ "Türkiye'nin başkenti neresi?", "Bakanlar Kurulu 180 günlük eylem planını açıkladı.", "Merkez Bankası, beklentiler doğrultusunda faizlerde değişikliğe gitmedi.", + "Cemal Sureya kimdir?", + "Bunlari Biliyor muydunuz?", + "Altinoluk Turkiye haritasinin neresinde yer alir?", ] From 64b8ee2dbe07ad70321a87cc55b653ef335f5c66 Mon Sep 17 00:00:00 2001 From: William Mattingly <62964060+wjbmattingly@users.noreply.github.com> Date: Mon, 14 Aug 2023 17:44:14 +0300 Subject: [PATCH 3/8] Update universe.json (#12904) * Update universe.json added hobbit-spacy to the universe json * Update universe.json removed displacy from hobbit-spacy and added a default text. 
--- website/meta/universe.json | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index 2ed8b4b41..ec380f847 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -4444,6 +4444,31 @@ }, "category": ["pipeline", "standalone", "scientific"], "tags": ["ner"] + }, + { + "id": "hobbit-spacy", + "title": "Hobbit spaCy", + "slogan": "NLP for Middle Earth", + "description": "Hobbit spaCy is a custom spaCy pipeline designed specifically for working with Middle Earth and texts from the world of J.R.R. Tolkien.", + "github": "wjbmattingly/hobbit-spacy", + "pip": "en-hobbit", + "code_example": [ + "import spacy", + "", + "nlp = spacy.load('en_hobbit')", + "doc = nlp('Frodo saw Glorfindel and Glóin; and in a corner alone Strider was sitting, clad in his old travel - worn clothes again')" + ], + "code_language": "python", + "thumb": "https://github.com/wjbmattingly/hobbit-spacy/blob/main/images/hobbit-thumbnail.png?raw=true", + "image": "https://github.com/wjbmattingly/hobbit-spacy/raw/main/images/hobbitspacy.png", + "author": "W.J.B. Mattingly", + "author_links": { + "twitter": "wjb_mattingly", + "github": "wjbmattingly", + "website": "https://wjbmattingly.com" + }, + "category": ["pipeline", "standalone"], + "tags": ["spans", "rules", "ner"] } ], From 76a9f9c6c6546ec50cb00fab70dbf5f8ac6e0929 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 16 Aug 2023 17:28:34 +0200 Subject: [PATCH 4/8] Docs: clarify abstract spacy.load examples (#12889) --- website/docs/api/top-level.mdx | 2 +- website/docs/usage/processing-pipelines.mdx | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx index 37e86a4bc..9cdc0c8ab 100644 --- a/website/docs/api/top-level.mdx +++ b/website/docs/api/top-level.mdx @@ -68,7 +68,7 @@ weights, and returns it. cls = spacy.util.get_lang_class(lang) # 1. Get Language class, e.g. 
English nlp = cls() # 2. Initialize it for name in pipeline: - nlp.add_pipe(name) # 3. Add the component to the pipeline + nlp.add_pipe(name, config={...}) # 3. Add the component to the pipeline nlp.from_disk(data_path) # 4. Load in the binary data ``` diff --git a/website/docs/usage/processing-pipelines.mdx b/website/docs/usage/processing-pipelines.mdx index 307cb9dcb..6ec8a0513 100644 --- a/website/docs/usage/processing-pipelines.mdx +++ b/website/docs/usage/processing-pipelines.mdx @@ -244,7 +244,7 @@ tagging pipeline. This is also why the pipeline state is always held by the together and returns an instance of `Language` with a pipeline set and access to the binary data: -```python {title="spacy.load under the hood"} +```python {title="spacy.load under the hood (abstract example)"} lang = "en" pipeline = ["tok2vec", "tagger", "parser", "ner", "attribute_ruler", "lemmatizer"] data_path = "path/to/en_core_web_sm/en_core_web_sm-3.0.0" @@ -252,7 +252,7 @@ data_path = "path/to/en_core_web_sm/en_core_web_sm-3.0.0" cls = spacy.util.get_lang_class(lang) # 1. Get Language class, e.g. English nlp = cls() # 2. Initialize it for name in pipeline: - nlp.add_pipe(name) # 3. Add the component to the pipeline + nlp.add_pipe(name, config={...}) # 3. Add the component to the pipeline nlp.from_disk(data_path) # 4. Load in the binary data ``` From 6dd56868de3c5e8308ef2ad31d7b63e40a87fe01 Mon Sep 17 00:00:00 2001 From: Connor Brinton Date: Mon, 21 Aug 2023 04:52:32 -0400 Subject: [PATCH 5/8] =?UTF-8?q?=F0=9F=93=9D=20Fix=20formula=20for=20recept?= =?UTF-8?q?ive=20field=20in=20docs=20(#12918)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SpaCy's HashEmbedCNN layer performs convolutions over tokens to produce contextualized embeddings using a `MaxoutWindowEncoder` layer. 
These convolutions are implemented using Thinc's `expand_window` layer, which concatenates `window_size` neighboring sequence items on either side of the sequence item being processed. This is repeated across `depth` convolutional layers. For example, consider the sequence "ABCDE" and a `MaxoutWindowEncoder` layer with a context window of 1 and a depth of 2. We'll focus on the token "C". We can visually represent the contextual embedding produced for "C" as: ```mermaid flowchart LR A0(A0) B0(B0) C0(C0) D0(D0) E0(E0) B1(B1) C1(C1) D1(D1) C2(C2) A0 --> B1 B0 --> B1 C0 --> B1 B0 --> C1 C0 --> C1 D0 --> C1 C0 --> D1 D0 --> D1 E0 --> D1 B1 --> C2 C1 --> C2 D1 --> C2 ``` Described in words, this graph shows that before the first layer of the convolution, the "receptive field" centered at each token consists only of that same token. That is to say, we have a receptive field of 1. The first layer of the convolution adds one neighboring token on either side to the receptive field. Since this is done on both sides, the receptive field increases by 2, giving the first layer a receptive field of 3. The second layer of the convolution adds an _additional_ neighboring token on either side to the receptive field, giving a final receptive field of 5. However, this doesn't match the formula currently given in the docs, which reads: > The receptive field of the CNN will be > `depth * (window_size * 2 + 1)`, so a 4-layer network with a window > size of `2` will be sensitive to 20 words at a time. Substituting in our depth of 2 and window size of 1, this formula gives us a receptive field of: ``` depth * (window_size * 2 + 1) = 2 * (1 * 2 + 1) = 2 * (2 + 1) = 2 * 3 = 6 ``` This not only doesn't match our computations from above, it's also an even number! This is suspicious, since the receptive field is supposed to be centered on a token, and not between tokens. Generally, this formula results in an even number for any even value of `depth`. 
The error in this formula is that the adjustment for the center token is multiplied by the depth, when it should occur only once. The corrected formula, `depth * window_size * 2 + 1`, gives the correct value for our small example from above: ``` depth * window_size * 2 + 1 = 2 * 1 * 2 + 1 = 4 + 1 = 5 ``` These changes update the docs to correct the receptive field formula and the example receptive field size. --- spacy/ml/models/tok2vec.py | 4 ++-- website/docs/api/architectures.mdx | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 2e9d21ef4..0edc89991 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -67,8 +67,8 @@ def build_hash_embed_cnn_tok2vec( are between 2 and 8. window_size (int): The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be - depth * (window_size * 2 + 1), so a 4-layer network with window_size of - 2 will be sensitive to 20 words at a time. Recommended value is 1. + depth * window_size * 2 + 1, so a 4-layer network with window_size of + 2 will be sensitive to 17 words at a time. Recommended value is 1. embed_size (int): The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between 2000 and 10000. diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx index bab24f13b..a292194e9 100644 --- a/website/docs/api/architectures.mdx +++ b/website/docs/api/architectures.mdx @@ -83,7 +83,7 @@ consisting of a CNN and a layer-normalized maxout activation function. | `width` | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~ | | `depth` | The number of convolutional layers to use. Recommended values are between `2` and `8`. 
~~int~~ | | `embed_size` | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~ | -| `window_size` | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * (window_size * 2 + 1)`, so a 4-layer network with a window size of `2` will be sensitive to 20 words at a time. Recommended value is `1`. ~~int~~ | +| `window_size` | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * window_size * 2 + 1`, so a 4-layer network with a window size of `2` will be sensitive to 17 words at a time. Recommended value is `1`. ~~int~~ | | `maxout_pieces` | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. ~~int~~ | | `subword_features` | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. ~~bool~~ | | `pretrained_vectors` | Whether to also use static vectors. ~~bool~~ | From d8a32c1050d2acb4fd121968d7e8780aae0b1382 Mon Sep 17 00:00:00 2001 From: PD Hall <20580126+pdhall99@users.noreply.github.com> Date: Tue, 29 Aug 2023 10:10:58 +0100 Subject: [PATCH 6/8] docs: fix ngram_range_suggester max_size description (#12939) --- website/docs/api/spancategorizer.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/spancategorizer.mdx b/website/docs/api/spancategorizer.mdx index 2b63d31ce..bfe33dfb9 100644 --- a/website/docs/api/spancategorizer.mdx +++ b/website/docs/api/spancategorizer.mdx @@ -521,7 +521,7 @@ has two columns, indicating the start and end position. 
| Name | Description | | ----------- | ---------------------------------------------------------------------------- | | `min_size` | The minimal phrase lengths to suggest (inclusive). ~~[int]~~ | -| `max_size` | The maximal phrase lengths to suggest (exclusive). ~~[int]~~ | +| `max_size` | The maximal phrase lengths to suggest (inclusive). ~~[int]~~ | | **CREATES** | The suggester function. ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ | ### spacy.preset_spans_suggester.v1 {id="preset_spans_suggester"} From 52758e1afaa99b2ac47e0ae825f0a86d209952f4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 30 Aug 2023 11:55:23 +0200 Subject: [PATCH 7/8] Add headers to netlify.toml [ci skip] --- website/netlify.toml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/website/netlify.toml b/website/netlify.toml index db7ae27c4..a99395918 100644 --- a/website/netlify.toml +++ b/website/netlify.toml @@ -16,3 +16,9 @@ NETLIFY_NEXT_PLUGIN_SKIP = "true" [[plugins]] package = "@netlify/plugin-nextjs" + +[[headers]] + for = "/*" + [headers.values] + X-Frame-Options = "DENY" + X-XSS-Protection = "1; mode=block" From 3e4264899c3b12f8eabc5cd700146177a34824d0 Mon Sep 17 00:00:00 2001 From: vincent d warmerdam Date: Wed, 30 Aug 2023 11:58:14 +0200 Subject: [PATCH 8/8] Update large-language-models.mdx (#12944) --- website/docs/api/large-language-models.mdx | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/website/docs/api/large-language-models.mdx b/website/docs/api/large-language-models.mdx index cc8328790..94b426cc8 100644 --- a/website/docs/api/large-language-models.mdx +++ b/website/docs/api/large-language-models.mdx @@ -893,7 +893,7 @@ OpenAI's `davinci` model family. > > ```ini > [components.llm.model] -> @llm_models = "spacy.Davinci.v1 " +> @llm_models = "spacy.Davinci.v1" > name = "davinci" > config = {"temperature": 0.3} > ``` @@ -914,7 +914,7 @@ OpenAI's `curie` model family. 
> > ```ini > [components.llm.model] -> @llm_models = "spacy.Curie.v1 " +> @llm_models = "spacy.Curie.v1" > name = "curie" > config = {"temperature": 0.3} > ``` @@ -935,7 +935,7 @@ OpenAI's `babbage` model family. > > ```ini > [components.llm.model] -> @llm_models = "spacy.Babbage.v1 " +> @llm_models = "spacy.Babbage.v1" > name = "babbage" > config = {"temperature": 0.3} > ``` @@ -956,7 +956,7 @@ OpenAI's `ada` model family. > > ```ini > [components.llm.model] -> @llm_models = "spacy.Ada.v1 " +> @llm_models = "spacy.Ada.v1" > name = "ada" > config = {"temperature": 0.3} > ``` @@ -977,7 +977,7 @@ Cohere's `command` model family. > > ```ini > [components.llm.model] -> @llm_models = "spacy.Command.v1 " +> @llm_models = "spacy.Command.v1" > name = "command" > config = {"temperature": 0.3} > ``` @@ -998,7 +998,7 @@ Anthropic's `claude-2` model family. > > ```ini > [components.llm.model] -> @llm_models = "spacy.Claude-2.v1 " +> @llm_models = "spacy.Claude-2.v1" > name = "claude-2" > config = {"temperature": 0.3} > ``` @@ -1019,7 +1019,7 @@ Anthropic's `claude-1` model family. > > ```ini > [components.llm.model] -> @llm_models = "spacy.Claude-1.v1 " +> @llm_models = "spacy.Claude-1.v1" > name = "claude-1" > config = {"temperature": 0.3} > ``` @@ -1040,7 +1040,7 @@ Anthropic's `claude-instant-1` model family. > > ```ini > [components.llm.model] -> @llm_models = "spacy.Claude-instant-1.v1 " +> @llm_models = "spacy.Claude-instant-1.v1" > name = "claude-instant-1" > config = {"temperature": 0.3} > ``` @@ -1061,7 +1061,7 @@ Anthropic's `claude-instant-1.1` model family. > > ```ini > [components.llm.model] -> @llm_models = "spacy.Claude-instant-1-1.v1 " +> @llm_models = "spacy.Claude-instant-1-1.v1" > name = "claude-instant-1.1" > config = {"temperature": 0.3} > ``` @@ -1082,7 +1082,7 @@ Anthropic's `claude-1.0` model family. 
> > ```ini > [components.llm.model] -> @llm_models = "spacy.Claude-1-0.v1 " +> @llm_models = "spacy.Claude-1-0.v1" > name = "claude-1.0" > config = {"temperature": 0.3} > ``` @@ -1124,7 +1124,7 @@ Anthropic's `claude-1.3` model family. > > ```ini > [components.llm.model] -> @llm_models = "spacy.Claude-1-3.v1 " +> @llm_models = "spacy.Claude-1-3.v1" > name = "claude-1.3" > config = {"temperature": 0.3} > ```