Merge branch 'master' into docs/llm

svlandeg 2023-08-31 11:54:30 +02:00
commit 40ae30dc5a
10 changed files with 53 additions and 19 deletions


@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.6.0"
+__version__ = "3.6.1"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"


@@ -15,4 +15,7 @@ sentences = [
     "Türkiye'nin başkenti neresi?",
     "Bakanlar Kurulu 180 günlük eylem planını açıkladı.",
     "Merkez Bankası, beklentiler doğrultusunda faizlerde değişikliğe gitmedi.",
+    "Cemal Sureya kimdir?",
+    "Bunlari Biliyor muydunuz?",
+    "Altinoluk Turkiye haritasinin neresinde yer alir?",
 ]
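
Not part of the diff: a minimal sketch of how these example sentences are typically exercised, assuming spaCy is installed. Since the new sentences are appended at the end of the list, `sentences[-3:]` selects exactly the added ones.

```python
import spacy
from spacy.lang.tr.examples import sentences

# Tokenize the three newly added Turkish example sentences with a blank
# Turkish pipeline (no trained components required).
nlp = spacy.blank("tr")
for doc in nlp.pipe(sentences[-3:]):
    print([token.text for token in doc])
```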


@ -67,8 +67,8 @@ def build_hash_embed_cnn_tok2vec(
are between 2 and 8. are between 2 and 8.
window_size (int): The number of tokens on either side to concatenate during window_size (int): The number of tokens on either side to concatenate during
the convolutions. The receptive field of the CNN will be the convolutions. The receptive field of the CNN will be
depth * (window_size * 2 + 1), so a 4-layer network with window_size of depth * window_size * 2 + 1, so a 4-layer network with window_size of
2 will be sensitive to 20 words at a time. Recommended value is 1. 2 will be sensitive to 17 words at a time. Recommended value is 1.
embed_size (int): The number of rows in the hash embedding tables. This can embed_size (int): The number of rows in the hash embedding tables. This can
be surprisingly small, due to the use of the hash embeddings. Recommended be surprisingly small, due to the use of the hash embeddings. Recommended
values are between 2000 and 10000. values are between 2000 and 10000.
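
As a side note (not part of the diff), the corrected docstring formula can be sanity-checked in a few lines of Python; `receptive_field` below is a hypothetical helper, not a spaCy function.

```python
def receptive_field(depth: int, window_size: int) -> int:
    # Each of the `depth` convolutional layers widens the context by
    # `window_size` tokens on both sides of the current token.
    return depth * window_size * 2 + 1

# A 4-layer network with window_size=2 sees 17 tokens at a time,
# not 20 as the old formula depth * (window_size * 2 + 1) implied.
assert receptive_field(depth=4, window_size=2) == 17
```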


@@ -83,7 +83,7 @@ consisting of a CNN and a layer-normalized maxout activation function.
 | `width` | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~ |
 | `depth` | The number of convolutional layers to use. Recommended values are between `2` and `8`. ~~int~~ |
 | `embed_size` | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~ |
-| `window_size` | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * (window_size * 2 + 1)`, so a 4-layer network with a window size of `2` will be sensitive to 20 words at a time. Recommended value is `1`. ~~int~~ |
+| `window_size` | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * window_size * 2 + 1`, so a 4-layer network with a window size of `2` will be sensitive to 17 words at a time. Recommended value is `1`. ~~int~~ |
 | `maxout_pieces` | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. ~~int~~ |
 | `subword_features` | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. ~~bool~~ |
 | `pretrained_vectors` | Whether to also use static vectors. ~~bool~~ |


@@ -521,7 +521,7 @@ has two columns, indicating the start and end position.
 | Name        | Description                                                                   |
 | ----------- | ----------------------------------------------------------------------------- |
 | `min_size`  | The minimal phrase lengths to suggest (inclusive). ~~[int]~~                   |
-| `max_size`  | The maximal phrase lengths to suggest (exclusive). ~~[int]~~                   |
+| `max_size`  | The maximal phrase lengths to suggest (inclusive). ~~[int]~~                   |
 | **CREATES** | The suggester function. ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~   |

 ### spacy.preset_spans_suggester.v1 {id="preset_spans_suggester"}
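
A small check of the corrected wording, assuming a recent spaCy (3.3+) where `spacy.ngram_range_suggester.v1` is registered under `misc`: with `min_size=1` and `max_size=2`, a three-token doc should yield five suggested spans (three unigrams plus two bigrams), i.e. `max_size` behaves as inclusive.

```python
import spacy
from spacy import registry

nlp = spacy.blank("en")
doc = nlp("New York City")

# Build the suggester from the registry and run it over a single doc.
make_suggester = registry.misc.get("spacy.ngram_range_suggester.v1")
suggester = make_suggester(min_size=1, max_size=2)
spans = suggester([doc])

print(spans.lengths)  # [5] -> 3 spans of length 1 + 2 spans of length 2
print(spans.dataXd)   # start/end offsets of each suggested span
```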


@@ -68,7 +68,7 @@ weights, and returns it.
 cls = spacy.util.get_lang_class(lang)  # 1. Get Language class, e.g. English
 nlp = cls()  # 2. Initialize it
 for name in pipeline:
-    nlp.add_pipe(name)  # 3. Add the component to the pipeline
+    nlp.add_pipe(name, config={...})  # 3. Add the component to the pipeline
 nlp.from_disk(data_path)  # 4. Load in the binary data
 ```


@@ -244,7 +244,7 @@ tagging pipeline. This is also why the pipeline state is always held by the
 together and returns an instance of `Language` with a pipeline set and access to
 the binary data:

-```python {title="spacy.load under the hood"}
+```python {title="spacy.load under the hood (abstract example)"}
 lang = "en"
 pipeline = ["tok2vec", "tagger", "parser", "ner", "attribute_ruler", "lemmatizer"]
 data_path = "path/to/en_core_web_sm/en_core_web_sm-3.0.0"
@@ -252,7 +252,7 @@ data_path = "path/to/en_core_web_sm/en_core_web_sm-3.0.0"
 cls = spacy.util.get_lang_class(lang)  # 1. Get Language class, e.g. English
 nlp = cls()  # 2. Initialize it
 for name in pipeline:
-    nlp.add_pipe(name)  # 3. Add the component to the pipeline
+    nlp.add_pipe(name, config={...})  # 3. Add the component to the pipeline
 nlp.from_disk(data_path)  # 4. Load in the binary data
 ```


@@ -4444,6 +4444,31 @@
             },
             "category": ["pipeline", "standalone", "scientific"],
             "tags": ["ner"]
+        },
+        {
+            "id": "hobbit-spacy",
+            "title": "Hobbit spaCy",
+            "slogan": "NLP for Middle Earth",
+            "description": "Hobbit spaCy is a custom spaCy pipeline designed specifically for working with Middle Earth and texts from the world of J.R.R. Tolkien.",
+            "github": "wjbmattingly/hobbit-spacy",
+            "pip": "en-hobbit",
+            "code_example": [
+                "import spacy",
+                "",
+                "nlp = spacy.load('en_hobbit')",
+                "doc = nlp('Frodo saw Glorfindel and Glóin; and in a corner alone Strider was sitting, clad in his old travel - worn clothes again')"
+            ],
+            "code_language": "python",
+            "thumb": "https://github.com/wjbmattingly/hobbit-spacy/blob/main/images/hobbit-thumbnail.png?raw=true",
+            "image": "https://github.com/wjbmattingly/hobbit-spacy/raw/main/images/hobbitspacy.png",
+            "author": "W.J.B. Mattingly",
+            "author_links": {
+                "twitter": "wjb_mattingly",
+                "github": "wjbmattingly",
+                "website": "https://wjbmattingly.com"
+            },
+            "category": ["pipeline", "standalone"],
+            "tags": ["spans", "rules", "ner"]
         }
     ],


@@ -16,3 +16,9 @@ NETLIFY_NEXT_PLUGIN_SKIP = "true"

 [[plugins]]
 package = "@netlify/plugin-nextjs"
+
+[[headers]]
+  for = "/*"
+  [headers.values]
+    X-Frame-Options = "DENY"
+    X-XSS-Protection = "1; mode=block"
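
Once deployed, the new response headers can be verified with a quick request; this sketch assumes the third-party `requests` package and that the deployed site in question is spacy.io.

```python
import requests

# Fetch any page and inspect the security headers added via netlify.toml.
resp = requests.get("https://spacy.io")
print(resp.headers.get("X-Frame-Options"))   # expected: DENY
print(resp.headers.get("X-XSS-Protection"))  # expected: 1; mode=block
```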