Added spacypdfreader to universe.json (#9963)

This commit is contained in:
Sam Edwardes 2022-01-02 23:34:36 -08:00 committed by GitHub
parent f40e237c5a
commit 6f65e2b544
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -1,5 +1,43 @@
{
"resources": [
{
"id": "spacypdfreader",
"title": "spadypdfreader",
"category": ["pipeline"],
"tags": ["PDF"],
"slogan": "Easy PDF to text to spaCy text extraction in Python.",
"description": "*spacypdfreader* is a Python library that allows you to convert PDF files directly into *spaCy* `Doc` objects. The library provides several built in parsers or bring your own parser. `Doc` objects are annotated with several custom attributes including: `token._.page_number`, `doc._.page_range`, `doc._.first_page`, `doc._.last_page`, `doc._.pdf_file_name`, and `doc._.page(int)`.",
"github": "SamEdwardes/spacypdfreader",
"pip": "spacypdfreader",
"url": "https://samedwardes.github.io/spacypdfreader/",
"code_language": "python",
"author": "Sam Edwardes",
"author_links": {
"twitter": "TheReaLSamlam",
"github": "SamEdwardes",
"website": "https://samedwardes.com"
},
"code_example": [
"import spacy",
"from spacypdfreader import pdf_reader",
"",
"nlp = spacy.load('en_core_web_sm')",
"doc = pdf_reader('tests/data/test_pdf_01.pdf', nlp)",
"",
"# Get the page number of any token.",
"print(doc[0]._.page_number) # 1",
"print(doc[-1]._.page_number) # 4",
"",
"# Get page meta data about the PDF document.",
"print(doc._.pdf_file_name) # 'tests/data/test_pdf_01.pdf'",
"print(doc._.page_range) # (1, 4)",
"print(doc._.first_page) # 1",
"print(doc._.last_page) # 4",
"",
"# Get all of the text from a specific PDF page.",
"print(doc._.page(4)) # 'able to display the destination page (unless...'"
]
},
{
"id": "nlpcloud",
"title": "NLPCloud.io",