Add spacy-layout [ci skip]

This commit is contained in:
Ines Montani 2024-11-19 10:43:40 +01:00
parent 7a7f191220
commit 0b43518611
2 changed files with 44 additions and 2 deletions

View File

@ -1394,6 +1394,48 @@
"website": "https://ines.io"
}
},
{
"id": "spacy-layout",
"slogan": "Process PDFs, Word documents and more with spaCy",
"github": "explosion/spacy-layout",
"description": "This plugin integrates with [Docling](https://ds4sd.github.io/docling/) to bring structured processing of PDFs, Word documents and other input formats to your spaCy pipeline. It outputs clean, structured data in a text-based format and creates spaCy's familiar `Doc` objects that let you access labelled text spans like sections, headings, or footnotes.\n\nThis workflow makes it easy to apply powerful NLP techniques to your documents, including linguistic analysis, named entity recognition, text classification and more. It's also great for implementing chunking for RAG pipelines.",
"pip": "spacy-layout",
"category": [
"pipeline"
],
"code_example": [
"import spacy",
"from spacy_layout import spaCyLayout",
"",
"nlp = spacy.blank(\"en\")",
"layout = spaCyLayout(nlp)",
"",
"# Process a document and create a spaCy Doc object",
"doc = layout(\"./starcraft.pdf\")",
"",
"# The text-based contents of the document",
"print(doc.text)",
"# Document layout including pages and page sizes",
"print(doc._.layout)",
"",
"# Layout spans for different sections",
"for span in doc.spans[\"layout\"]:",
" # Document section and token and character offsets into the text",
" print(span.text, span.start, span.end, span.start_char, span.end_char)",
" # Section type, e.g. \"text\", \"title\", \"section_header\" etc.",
" print(span.label_)",
" # Layout features of the section, including bounding box",
" print(span._.layout)",
" # Closest heading to the span (accuracy depends on document structure)",
" print(span._.heading)"
],
"author": "Ines Montani",
"author_links": {
"twitter": "_inesmontani",
"github": "ines",
"website": "https://ines.io"
}
},
{
"id": "spacyopentapioca",
"title": "spaCyOpenTapioca",

View File

@ -58,8 +58,8 @@ const AlertSpace = ({ nightly, legacy }) => {
}
const navAlert = (
<Link to="https://explosion.ai/blog/sp-global-commodities" noLinkLayout>
💥 <strong>New:</strong> Case study with S&P Global
<Link to="https://github.com/explosion/spacy-layout" noLinkLayout>
💥 <strong>New:</strong> spaCy for PDFs and Word docs
</Link>
)