mirror of https://github.com/explosion/spaCy.git
synced 2025-07-10 16:22:29 +03:00

Merge branch 'master' into docs/llm
Commit 40ae30dc5a
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.6.0"
+__version__ = "3.6.1"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"

@@ -15,4 +15,7 @@ sentences = [
     "Türkiye'nin başkenti neresi?",
     "Bakanlar Kurulu 180 günlük eylem planını açıkladı.",
     "Merkez Bankası, beklentiler doğrultusunda faizlerde değişikliğe gitmedi.",
+    "Cemal Sureya kimdir?",
+    "Bunlari Biliyor muydunuz?",
+    "Altinoluk Turkiye haritasinin neresinde yer alir?",
 ]

@@ -67,8 +67,8 @@ def build_hash_embed_cnn_tok2vec(
         are between 2 and 8.
     window_size (int): The number of tokens on either side to concatenate during
         the convolutions. The receptive field of the CNN will be
-        depth * (window_size * 2 + 1), so a 4-layer network with window_size of
-        2 will be sensitive to 20 words at a time. Recommended value is 1.
+        depth * window_size * 2 + 1, so a 4-layer network with window_size of
+        2 will be sensitive to 17 words at a time. Recommended value is 1.
     embed_size (int): The number of rows in the hash embedding tables. This can
         be surprisingly small, due to the use of the hash embeddings. Recommended
         values are between 2000 and 10000.

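The corrected receptive-field formula is easy to sanity-check with a quick calculation; a minimal sketch, with variable names simply mirroring the docstring:

```python
# Each of the `depth` CNN layers adds `window_size` tokens of context on either
# side of a token, plus the token itself.
depth = 4
window_size = 2
receptive_field = depth * window_size * 2 + 1
print(receptive_field)  # 17 -- the old formula, depth * (window_size * 2 + 1), gave 20
```
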
@@ -83,7 +83,7 @@ consisting of a CNN and a layer-normalized maxout activation function.
 | `width` | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~ |
 | `depth` | The number of convolutional layers to use. Recommended values are between `2` and `8`. ~~int~~ |
 | `embed_size` | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~ |
-| `window_size` | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * (window_size * 2 + 1)`, so a 4-layer network with a window size of `2` will be sensitive to 20 words at a time. Recommended value is `1`. ~~int~~ |
+| `window_size` | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * window_size * 2 + 1`, so a 4-layer network with a window size of `2` will be sensitive to 17 words at a time. Recommended value is `1`. ~~int~~ |
 | `maxout_pieces` | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. ~~int~~ |
 | `subword_features` | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. ~~bool~~ |
 | `pretrained_vectors` | Whether to also use static vectors. ~~bool~~ |

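For reference, the architecture described by this table can also be built directly from the architectures registry; a rough sketch assuming the registered name `spacy.HashEmbedCNN.v2`, with illustrative (not recommended) parameter values:

```python
import spacy

# Resolve the registered architecture and build a standalone tok2vec model.
build_tok2vec = spacy.registry.architectures.get("spacy.HashEmbedCNN.v2")
tok2vec = build_tok2vec(
    width=96,            # input/output width
    depth=4,             # number of CNN layers
    embed_size=2000,     # rows in the hash embedding tables
    window_size=1,       # context tokens on either side per layer
    maxout_pieces=3,     # pieces in the maxout non-linearity
    subword_features=True,
    pretrained_vectors=None,
)
```
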
@@ -521,7 +521,7 @@ has two columns, indicating the start and end position.
 | Name | Description |
 | ----------- | ---------------------------------------------------------------------------- |
 | `min_size` | The minimal phrase lengths to suggest (inclusive). ~~[int]~~ |
-| `max_size` | The maximal phrase lengths to suggest (exclusive). ~~[int]~~ |
+| `max_size` | The maximal phrase lengths to suggest (inclusive). ~~[int]~~ |
 | **CREATES** | The suggester function. ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ |

 ### spacy.preset_spans_suggester.v1 {id="preset_spans_suggester"}

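A minimal sketch of using the suggester from Python, assuming it is resolved from the misc registry under `spacy.ngram_range_suggester.v1`; the sizes are illustrative:

```python
import spacy

# Build a suggester proposing all spans of 1 to 3 tokens (both bounds
# inclusive, per the corrected documentation above).
make_suggester = spacy.registry.misc.get("spacy.ngram_range_suggester.v1")
suggester = make_suggester(min_size=1, max_size=3)

nlp = spacy.blank("en")
doc = nlp("The quick brown fox")
spans = suggester([doc])  # Ragged array of (start, end) token offsets
print(spans.dataXd)
```
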
@@ -68,7 +68,7 @@ weights, and returns it.
 cls = spacy.util.get_lang_class(lang) # 1. Get Language class, e.g. English
 nlp = cls() # 2. Initialize it
 for name in pipeline:
-    nlp.add_pipe(name) # 3. Add the component to the pipeline
+    nlp.add_pipe(name, config={...}) # 3. Add the component to the pipeline
 nlp.from_disk(data_path) # 4. Load in the binary data
 ```

@@ -244,7 +244,7 @@ tagging pipeline. This is also why the pipeline state is always held by the
 together and returns an instance of `Language` with a pipeline set and access to
 the binary data:

-```python {title="spacy.load under the hood"}
+```python {title="spacy.load under the hood (abstract example)"}
 lang = "en"
 pipeline = ["tok2vec", "tagger", "parser", "ner", "attribute_ruler", "lemmatizer"]
 data_path = "path/to/en_core_web_sm/en_core_web_sm-3.0.0"

@@ -252,7 +252,7 @@ data_path = "path/to/en_core_web_sm/en_core_web_sm-3.0.0"
 cls = spacy.util.get_lang_class(lang) # 1. Get Language class, e.g. English
 nlp = cls() # 2. Initialize it
 for name in pipeline:
-    nlp.add_pipe(name) # 3. Add the component to the pipeline
+    nlp.add_pipe(name, config={...}) # 3. Add the component to the pipeline
 nlp.from_disk(data_path) # 4. Load in the binary data
 ```

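The `config` argument shown in the snippet above overrides a component's default settings when it is added to the pipeline. A self-contained sketch using the built-in `sentencizer`; the settings are only illustrative:

```python
import spacy

nlp = spacy.blank("en")
# Override the sentencizer's default punctuation characters at add time.
nlp.add_pipe("sentencizer", config={"punct_chars": [".", "!", "?"]})
doc = nlp("Hello world! How are you?")
print([sent.text for sent in doc.sents])
```
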
@@ -4444,6 +4444,31 @@
             },
             "category": ["pipeline", "standalone", "scientific"],
             "tags": ["ner"]
+        },
+        {
+            "id": "hobbit-spacy",
+            "title": "Hobbit spaCy",
+            "slogan": "NLP for Middle Earth",
+            "description": "Hobbit spaCy is a custom spaCy pipeline designed specifically for working with Middle Earth and texts from the world of J.R.R. Tolkien.",
+            "github": "wjbmattingly/hobbit-spacy",
+            "pip": "en-hobbit",
+            "code_example": [
+                "import spacy",
+                "",
+                "nlp = spacy.load('en_hobbit')",
+                "doc = nlp('Frodo saw Glorfindel and Glóin; and in a corner alone Strider was sitting, clad in his old travel - worn clothes again')"
+            ],
+            "code_language": "python",
+            "thumb": "https://github.com/wjbmattingly/hobbit-spacy/blob/main/images/hobbit-thumbnail.png?raw=true",
+            "image": "https://github.com/wjbmattingly/hobbit-spacy/raw/main/images/hobbitspacy.png",
+            "author": "W.J.B. Mattingly",
+            "author_links": {
+                "twitter": "wjb_mattingly",
+                "github": "wjbmattingly",
+                "website": "https://wjbmattingly.com"
+            },
+            "category": ["pipeline", "standalone"],
+            "tags": ["spans", "rules", "ner"]
         }
     ],

@@ -16,3 +16,9 @@ NETLIFY_NEXT_PLUGIN_SKIP = "true"

 [[plugins]]
 package = "@netlify/plugin-nextjs"
+
+[[headers]]
+for = "/*"
+[headers.values]
+X-Frame-Options = "DENY"
+X-XSS-Protection = "1; mode=block"