mirror of https://github.com/explosion/spaCy.git
synced 2024-12-26 09:56:28 +03:00

Merge branch 'master' into bugfix/fix-morph-memory-zone

commit 2676746efa

@@ -35,7 +35,7 @@ so that more people can benefit from it.
 
 When opening an issue, use a **descriptive title** and include your
 **environment** (operating system, Python version, spaCy version). Our
-[issue template](https://github.com/explosion/spaCy/issues/new) helps you
+[issue templates](https://github.com/explosion/spaCy/issues/new/choose) help you
 remember the most important details to include. If you've discovered a bug, you
 can also submit a [regression test](#fixing-bugs) straight away. When you're
 opening an issue to report the bug, simply refer to your pull request in the

@@ -1,5 +1,5 @@
 The list of Croatian lemmas was extracted from the reldi-tagger repository (https://github.com/clarinsi/reldi-tagger).
-Reldi-tagger is licesned under the Apache 2.0 licence.
+Reldi-tagger is licensed under the Apache 2.0 licence.
 
 @InProceedings{ljubesic16-new,
   author = {Nikola Ljubešić and Filip Klubička and Željko Agić and Ivo-Pavao Jazbec},
@@ -890,6 +890,28 @@ when loading a config with
 | `pipe_name` | Name of pipeline component to replace listeners for. ~~str~~ |
 | `listeners` | The paths to the listeners, relative to the component config, e.g. `["model.tok2vec"]`. Typically, implementations will only connect to one tok2vec component, `model.tok2vec`, but in theory, custom models can use multiple listeners. The value here can either be an empty list to not replace any listeners, or a _complete_ list of the paths to all listener layers used by the model that should be replaced. ~~Iterable[str]~~ |
 
+## Language.memory_zone {id="memory_zone",tag="contextmanager",version="3.8"}
+
+Begin a block where all resources allocated during the block will be freed at
+the end of it. If a resource was created within the memory zone block,
+accessing it outside the block is invalid. Behavior of this invalid access is
+undefined. Memory zones should not be nested. The memory zone is helpful for
+services that need to process large volumes of text with a defined memory
+budget.
+
+> #### Example
+>
+> ```python
+> counts = Counter()
+> with nlp.memory_zone():
+>     for doc in nlp.pipe(texts):
+>         for token in doc:
+>             counts[token.text] += 1
+> ```
+
+| Name        | Description |
+| ----------- | ----------- |
+| `mem`       | Optional `cymem.Pool` object to own allocations (created if not provided). This argument is not required for ordinary usage. Defaults to `None`. ~~Optional[cymem.Pool]~~ |
+| **RETURNS** | The memory pool that owns the allocations. This object is not required for ordinary usage. ~~Iterator[cymem.Pool]~~ |
+
 ## Language.meta {id="meta",tag="property"}
 
 Meta data for the `Language` class, including name, version, data sources,
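
The `mem` argument above is only needed if you want to own the pool yourself.
A minimal sketch of that usage, assuming a loaded pipeline `nlp` and `cymem`'s
`Pool` class (cymem is already a spaCy dependency):

```python
from cymem.cymem import Pool

# Create the pool up front so it can be reused or inspected later;
# allocations made inside the zone are owned by this pool.
pool = Pool()
with nlp.memory_zone(mem=pool):
    doc = nlp("This text is processed inside the memory zone.")
```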

@@ -1597,7 +1597,7 @@ The name of the model to be used has to be passed in via the `name` attribute.
 
 | Argument | Description |
 | -------- | ----------- |
-| `name`   | The name of a mdodel supported by LangChain for this API. ~~str~~ |
+| `name`   | The name of a model supported by LangChain for this API. ~~str~~ |
 | `config` | Configuration passed on to the LangChain model. Defaults to `{}`. ~~Dict[Any, Any]~~ |
 | `query`  | Function that executes the prompts. If `None`, defaults to `spacy.CallLangChain.v1`. ~~Optional[Callable[["langchain.llms.BaseLLM", Iterable[Any]], Iterable[Any]]]~~ |

website/docs/usage/memory-management.mdx (new file, 131 lines)
@@ -0,0 +1,131 @@
---
title: Memory Management
teaser: Managing Memory for persistent services
version: 3.8
menu:
  - ['Memory Zones', 'memoryzones']
  - ['Clearing Doc attributes', 'doc-attrs']
---

spaCy maintains a few internal caches that improve speed but cause memory usage
to increase slightly over time. If you're running a batch process that doesn't
need to be long-lived, the increase in memory usage generally isn't a problem.
However, if you're running spaCy inside a web service, you'll often want
spaCy's memory usage to stay consistent. Transformer models can also run into
memory problems sometimes, especially when used on a GPU.

## Memory zones {id="memoryzones"}

You can tell spaCy to free data from its internal caches (especially the
[`Vocab`](/api/vocab)) using the [`Language.memory_zone`](/api/language#memory_zone)
context manager. Enter the context manager and process your text within it, and
spaCy will **reset its internal caches** (freeing up the associated memory) at
the end of the block. spaCy objects created inside the memory zone must not be
accessed once the memory zone is finished.

```python
### Using memory zones
from collections import Counter


def count_words(nlp, texts):
    counts = Counter()
    with nlp.memory_zone():
        for doc in nlp.pipe(texts):
            for token in doc:
                counts[token.text] += 1
    return counts
```
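
One rough way to observe the reset, as a sketch rather than documented
behavior: `len(nlp.vocab)` counts the cached `Lexeme` entries, so it should
grow inside the zone and shrink back once the block exits.

```python
import spacy

nlp = spacy.blank("en")
n_before = len(nlp.vocab)
with nlp.memory_zone():
    doc = nlp("only fools rush in")
    print(len(nlp.vocab))  # larger: new words were cached as Lexemes
print(len(nlp.vocab))  # expected to match n_before after the reset
```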

<Infobox title="Important note" variant="warning">

Exiting the memory zone invalidates all `Doc`, `Token`, `Span` and `Lexeme`
objects that were created within it. If you access these objects after the
memory zone exits, you may encounter a segmentation fault due to invalid
memory access.

</Infobox>

spaCy needs the memory zone context manager because the processing pipeline
can't keep track of which [`Doc`](/api/doc) objects are referring to data in
the shared [`Vocab`](/api/vocab) cache. For instance, when spaCy encounters a
new word, a new [`Lexeme`](/api/lexeme) entry is stored in the `Vocab`, and the
`Doc` object points to this shared data. When the `Doc` goes out of scope, the
`Vocab` has no way of knowing that this `Lexeme` is no longer in use.

The memory zone solves this problem by allowing you to tell the processing
pipeline that all data created between two points is no longer in use. It is up
to you to honor this agreement. If you access objects that are supposed to no
longer be in use, you may encounter a segmentation fault due to invalid memory
access.

A common use case for memory zones will be **within a web service**. The
processing pipeline can be loaded once, either as a context variable or a
global, and each request can be handled within a memory zone:

```python
### Memory zones with FastAPI {highlight="10,23"}
from fastapi import FastAPI, APIRouter, Depends, Request
import spacy
from spacy.language import Language

router = APIRouter()


def make_app():
    app = FastAPI()
    app.state.NLP = spacy.load("en_core_web_sm")
    app.include_router(router)
    return app


def get_nlp(request: Request) -> Language:
    return request.app.state.NLP


@router.post("/parse")
def parse_texts(
    *, text_batch: list[str], nlp: Language = Depends(get_nlp)
) -> list[dict]:
    with nlp.memory_zone():
        # Put the spaCy call within a separate function, so we can't
        # leak the Doc objects outside the scope of the memory zone.
        output = _process_text(nlp, text_batch)
    return output


def _process_text(nlp: Language, texts: list[str]) -> list[dict]:
    # Call spaCy, and transform the output into our own data
    # structures. This function is called from inside a memory
    # zone, so must not return the spaCy objects.
    docs = list(nlp.pipe(texts))
    return [
        {
            "tokens": [{"text": t.text} for t in doc],
            "entities": [
                {"start": e.start, "end": e.end, "label": e.label_} for e in doc.ents
            ],
        }
        for doc in docs
    ]


app = make_app()
```
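
To sanity-check the service, one option is FastAPI's bundled test client (it
requires `httpx`; the JSON array body matches the single `text_batch` body
parameter above). This is a sketch, not part of the original example:

```python
from fastapi.testclient import TestClient

client = TestClient(app)
# The endpoint's only body parameter is a list, so the body is a JSON array
response = client.post("/parse", json=["Berlin is a city in Germany."])
print(response.json())  # e.g. [{"tokens": [...], "entities": [...]}]
```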

## Clearing transformer tensors and other Doc attributes {id="doc-attrs"}

The [`Transformer`](/api/transformer) and [`Tok2Vec`](/api/tok2vec) components
set intermediate values onto the `Doc` object during parsing. This can cause
GPU memory to be exhausted if many `Doc` objects are kept in memory together.

To resolve this, you can add the [`doc_cleaner`](/api/pipeline-functions#doc_cleaner)
component to your pipeline. By default this will clean up the
[`Doc._.trf_data`](/api/transformer#custom_attributes) extension attribute and
the [`Doc.tensor`](/api/doc#attributes) attribute. You can also have it clean
up other intermediate extension attributes you use in custom pipeline
components (see the sketch after the example below).

```python
### Adding the doc_cleaner
nlp.add_pipe("doc_cleaner", config={"attrs": {"tensor": None}})
```
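
The same mechanism extends to custom extension attributes. In this sketch,
`Doc._.chunk_text` is a hypothetical attribute set by one of your own
components; the `"_.trf_data"` entry mirrors the component's default config:

```python
### Cleaning a custom extension attribute (sketch)
nlp.add_pipe(
    "doc_cleaner",
    config={"attrs": {"tensor": None, "_.trf_data": None, "_.chunk_text": None}},
)
```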

@@ -720,7 +720,7 @@ matches = matcher(doc)
 
 # Serve visualization of sentences containing match with displaCy
 # set manual=True to make displaCy render straight from a dictionary
-# (if you're not running the code within a Jupyer environment, you can
+# (if you're not running the code within a Jupyter environment, you can
 # use displacy.serve instead)
 displacy.render(matched_sents, style="ent", manual=True)
 ```

@@ -5,45 +5,96 @@
    {
      "label": "Get started",
      "items": [
        {
          "text": "Installation",
          "url": "/usage"
        },
        {
          "text": "Models & Languages",
          "url": "/usage/models"
        },
        {
          "text": "Facts & Figures",
          "url": "/usage/facts-figures"
        },
        {
          "text": "spaCy 101",
          "url": "/usage/spacy-101"
        },
        {
          "text": "New in v3.7",
          "url": "/usage/v3-7"
        },
        {
          "text": "New in v3.6",
          "url": "/usage/v3-6"
        },
        {
          "text": "New in v3.5",
          "url": "/usage/v3-5"
        }
      ]
    },
    {
      "label": "Guides",
      "items": [
        {
          "text": "Linguistic Features",
          "url": "/usage/linguistic-features"
        },
        {
          "text": "Rule-based Matching",
          "url": "/usage/rule-based-matching"
        },
        {
          "text": "Processing Pipelines",
          "url": "/usage/processing-pipelines"
        },
        {
          "text": "Embeddings & Transformers",
          "url": "/usage/embeddings-transformers"
        },
        {
          "text": "Large Language Models",
          "url": "/usage/large-language-models"
        },
        {
          "text": "Training Models",
          "url": "/usage/training"
        },
        {
          "text": "Layers & Model Architectures",
          "url": "/usage/layers-architectures"
        },
        {
          "text": "spaCy Projects",
          "url": "/usage/projects"
        },
        {
          "text": "Saving & Loading",
          "url": "/usage/saving-loading"
        },
        {
          "text": "Memory Management",
          "url": "/usage/memory-management"
        },
        {
          "text": "Visualizers",
          "url": "/usage/visualizers"
        }
      ]
    },
    {
      "label": "Resources",
      "items": [
        {
          "text": "Project Templates",
          "url": "https://github.com/explosion/projects"
        },
        {
          "text": "v2.x Documentation",
          "url": "https://v2.spacy.io"
        },
        {
          "text": "Custom Solutions",
          "url": "https://explosion.ai/custom-solutions"

@@ -57,7 +108,12 @@
      "items": [
        {
          "label": "Models",
          "items": [
            {
              "text": "Overview",
              "url": "/models"
            }
          ]
        },
        {
          "label": "Trained Pipelines",

@@ -71,91 +127,261 @@
    {
      "label": "Overview",
      "items": [
        {
          "text": "Library Architecture",
          "url": "/api"
        },
        {
          "text": "Model Architectures",
          "url": "/api/architectures"
        },
        {
          "text": "Data Formats",
          "url": "/api/data-formats"
        },
        {
          "text": "Command Line",
          "url": "/api/cli"
        },
        {
          "text": "Functions",
          "url": "/api/top-level"
        }
      ]
    },
    {
      "label": "Containers",
      "items": [
        {
          "text": "Doc",
          "url": "/api/doc"
        },
        {
          "text": "DocBin",
          "url": "/api/docbin"
        },
        {
          "text": "Example",
          "url": "/api/example"
        },
        {
          "text": "Language",
          "url": "/api/language"
        },
        {
          "text": "Lexeme",
          "url": "/api/lexeme"
        },
        {
          "text": "Span",
          "url": "/api/span"
        },
        {
          "text": "SpanGroup",
          "url": "/api/spangroup"
        },
        {
          "text": "Token",
          "url": "/api/token"
        }
      ]
    },
    {
      "label": "Pipeline",
      "items": [
        {
          "text": "AttributeRuler",
          "url": "/api/attributeruler"
        },
        {
          "text": "CoreferenceResolver",
          "url": "/api/coref"
        },
        {
          "text": "CuratedTransformer",
          "url": "/api/curatedtransformer"
        },
        {
          "text": "DependencyParser",
          "url": "/api/dependencyparser"
        },
        {
          "text": "EditTreeLemmatizer",
          "url": "/api/edittreelemmatizer"
        },
        {
          "text": "EntityLinker",
          "url": "/api/entitylinker"
        },
        {
          "text": "EntityRecognizer",
          "url": "/api/entityrecognizer"
        },
        {
          "text": "EntityRuler",
          "url": "/api/entityruler"
        },
        {
          "text": "Large Language Models",
          "url": "/api/large-language-models"
        },
        {
          "text": "Lemmatizer",
          "url": "/api/lemmatizer"
        },
        {
          "text": "Morphologizer",
          "url": "/api/morphologizer"
        },
        {
          "text": "SentenceRecognizer",
          "url": "/api/sentencerecognizer"
        },
        {
          "text": "Sentencizer",
          "url": "/api/sentencizer"
        },
        {
          "text": "SpanCategorizer",
          "url": "/api/spancategorizer"
        },
        {
          "text": "SpanFinder",
          "url": "/api/spanfinder"
        },
        {
          "text": "SpanResolver",
          "url": "/api/span-resolver"
        },
        {
          "text": "SpanRuler",
          "url": "/api/spanruler"
        },
        {
          "text": "Tagger",
          "url": "/api/tagger"
        },
        {
          "text": "TextCategorizer",
          "url": "/api/textcategorizer"
        },
        {
          "text": "Tok2Vec",
          "url": "/api/tok2vec"
        },
        {
          "text": "Tokenizer",
          "url": "/api/tokenizer"
        },
        {
          "text": "TrainablePipe",
          "url": "/api/pipe"
        },
        {
          "text": "Transformer",
          "url": "/api/transformer"
        },
        {
          "text": "Other Functions",
          "url": "/api/pipeline-functions"
        }
      ]
    },
    {
      "label": "Matchers",
      "items": [
        {
          "text": "DependencyMatcher",
          "url": "/api/dependencymatcher"
        },
        {
          "text": "Matcher",
          "url": "/api/matcher"
        },
        {
          "text": "PhraseMatcher",
          "url": "/api/phrasematcher"
        }
      ]
    },
    {
      "label": "Other",
      "items": [
        {
          "text": "Attributes",
          "url": "/api/attributes"
        },
        {
          "text": "BaseVectors",
          "url": "/api/basevectors"
        },
        {
          "text": "Corpus",
          "url": "/api/corpus"
        },
        {
          "text": "InMemoryLookupKB",
          "url": "/api/inmemorylookupkb"
        },
        {
          "text": "KnowledgeBase",
          "url": "/api/kb"
        },
        {
          "text": "Lookups",
          "url": "/api/lookups"
        },
        {
          "text": "MorphAnalysis",
          "url": "/api/morphology#morphanalysis"
        },
        {
          "text": "Morphology",
          "url": "/api/morphology"
        },
        {
          "text": "Scorer",
          "url": "/api/scorer"
        },
        {
          "text": "StringStore",
          "url": "/api/stringstore"
        },
        {
          "text": "Vectors",
          "url": "/api/vectors"
        },
        {
          "text": "Vocab",
          "url": "/api/vocab"
        }
      ]
    },
    {
      "label": "Cython",
      "items": [
        {
          "text": "Architecture",
          "url": "/api/cython"
        },
        {
          "text": "Classes",
          "url": "/api/cython-classes"
        },
        {
          "text": "Structs",
          "url": "/api/cython-structs"
        }
      ]
    },
    {
      "label": "Legacy",
      "items": [
        {
          "text": "Legacy functions",
          "url": "/api/legacy"
        }
      ]
    }
  ]
}

@@ -276,6 +276,47 @@
       "ancient Greek"
     ]
   },
+  {
+    "id": "solipcysme",
+    "title": "solipCysme",
+    "slogan": "spaCy pipeline for French fictions and first person point of view texts.",
+    "description": "__solipCysme__ is a pipeline for the French language, designed for the analysis of fictions and first person point of view texts, with a focus on personal pronouns.",
+    "github": "thjbdvlt/solipCysme",
+    "code_example": [
+      "# pip install https://huggingface.co/thjbdvlt/fr_solipcysme/resolve/main/fr_solipcysme-any-py3-none-any.whl",
+      "",
+      "import spacy",
+      "",
+      "nlp = spacy.load('fr_solipcysme')",
+      "for i in nlp(",
+      "    \"la MACHINE à (b)rouiller le temps s'est peut-être déraillée..?\"",
+      "):",
+      "    print(",
+      "        i,",
+      "        i.norm_,",
+      "        i.pos_,",
+      "        i.morph,",
+      "        i.lemma_,",
+      "        i.dep_,",
+      "        i._.tokentype,",
+      "        i._.vv_pos,",
+      "        i._.vv_morph",
+      "    )"
+    ],
+    "code_language": "python",
+    "author": "thjbdvlt",
+    "author_links": {
+      "github": "thjbdvlt"
+    },
+    "category": [
+      "pipeline",
+      "research",
+      "models"
+    ],
+    "tags": [
+      "french"
+    ]
+  },
   {
     "id": "spacy-cleaner",
     "title": "spacy-cleaner",

@@ -1353,6 +1394,48 @@
       "website": "https://ines.io"
     }
   },
+  {
+    "id": "spacy-layout",
+    "slogan": "Process PDFs, Word documents and more with spaCy",
+    "github": "explosion/spacy-layout",
+    "description": "This plugin integrates with [Docling](https://ds4sd.github.io/docling/) to bring structured processing of PDFs, Word documents and other input formats to your spaCy pipeline. It produces clean, structured data in a text-based format and outputs spaCy's familiar `Doc` objects that let you access labelled text spans like sections, headings, or footnotes.\n\nThis workflow makes it easy to apply powerful NLP techniques to your documents, including linguistic analysis, named entity recognition, text classification and more. It's also great for implementing chunking for RAG pipelines.",
+    "pip": "spacy-layout",
+    "category": [
+      "pipeline"
+    ],
+    "code_example": [
+      "import spacy",
+      "from spacy_layout import spaCyLayout",
+      "",
+      "nlp = spacy.blank(\"en\")",
+      "layout = spaCyLayout(nlp)",
+      "",
+      "# Process a document and create a spaCy Doc object",
+      "doc = layout(\"./starcraft.pdf\")",
+      "",
+      "# The text-based contents of the document",
+      "print(doc.text)",
+      "# Document layout including pages and page sizes",
+      "print(doc._.layout)",
+      "",
+      "# Layout spans for different sections",
+      "for span in doc.spans[\"layout\"]:",
+      "    # Document section and token and character offsets into the text",
+      "    print(span.text, span.start, span.end, span.start_char, span.end_char)",
+      "    # Section type, e.g. \"text\", \"title\", \"section_header\" etc.",
+      "    print(span.label_)",
+      "    # Layout features of the section, including bounding box",
+      "    print(span._.layout)",
+      "    # Closest heading to the span (accuracy depends on document structure)",
+      "    print(span._.heading)"
+    ],
+    "author": "Ines Montani",
+    "author_links": {
+      "twitter": "_inesmontani",
+      "github": "ines",
+      "website": "https://ines.io"
+    }
+  },
   {
     "id": "spacyopentapioca",
     "title": "spaCyOpenTapioca",

@@ -2587,6 +2670,20 @@
       "courses"
     ]
   },
+  {
+    "type": "education",
+    "id": "spacy-quickstart",
+    "title": "spaCy Quickstart",
+    "slogan": "Learn spaCy basics quickly by visualizing various Doc objects",
+    "description": "In this course, I use the itables Python library inside a Jupyter notebook so that you can visualize the different spaCy document objects. This will provide a solid foundation for people who wish to learn the spaCy NLP library.",
+    "url": "https://learnspacy.com/courses/spacy-quickstart/",
+    "image": "https://learnspacy.com/wp-content/uploads/2024/09/custom_search_builder_spacy-2048x1202.png",
+    "thumb": "https://learnspacy.com/wp-content/uploads/2024/09/learnspacy_logo.png",
+    "author": "Aravind Mohanoor",
+    "category": [
+      "courses"
+    ]
+  },
   {
     "type": "education",
     "id": "video-spacys-ner-model",

@@ -87,6 +87,9 @@
     margin-bottom: 0
     height: 100%
 
+    a, a:hover
+        color: inherit
+
 .banner-content-small
     display: block
     margin-bottom: 0 !important

@@ -58,8 +58,8 @@ const AlertSpace = ({ nightly, legacy }) => {
 }
 
 const navAlert = (
-    <Link to="https://explosion.ai/blog/sp-global-commodities" noLinkLayout>
-        💥 <strong>New:</strong> Case study with S&P Global
+    <Link to="https://github.com/explosion/spacy-layout" noLinkLayout>
+        💥 <strong>New:</strong> spaCy for PDFs and Word docs
     </Link>
 )