Added spacypdfreader to universe.json (#9963)

2025-08-02 03:10:22 +03:00 · 2022-01-02 23:34:36 -08:00 · 2022-01-02 23:34:36 -08:00 · 6f65e2b544
commit 6f65e2b544
parent f40e237c5a
1 changed files with 38 additions and 0 deletions
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -1,5 +1,43 @@
 {
    "resources": [
+        {
+            "id": "spacypdfreader",
+            "title": "spacypdfreader",
+            "category": ["pipeline"],
+            "tags": ["PDF"],
+            "slogan": "Easy PDF to text to spaCy text extraction in Python.",
+            "description": "*spacypdfreader* is a Python library that allows you to convert PDF files directly into *spaCy* `Doc` objects. The library provides several built-in parsers, or you can bring your own parser. `Doc` objects are annotated with several custom attributes including: `token._.page_number`, `doc._.page_range`, `doc._.first_page`, `doc._.last_page`, `doc._.pdf_file_name`, and `doc._.page(int)`.",
+            "github": "SamEdwardes/spacypdfreader",
+            "pip": "spacypdfreader",
+            "url": "https://samedwardes.github.io/spacypdfreader/",
+            "code_language": "python",
+            "author": "Sam Edwardes",
+            "author_links": {
+                "twitter": "TheReaLSamlam",
+                "github": "SamEdwardes",
+                "website": "https://samedwardes.com"
+            },
+            "code_example": [
+                "import spacy",
+                "from spacypdfreader import pdf_reader",
+                "",
+                "nlp = spacy.load('en_core_web_sm')",
+                "doc = pdf_reader('tests/data/test_pdf_01.pdf', nlp)",
+                "",
+                "# Get the page number of any token.",
+                "print(doc[0]._.page_number)  # 1",
+                "print(doc[-1]._.page_number) # 4",
+                "",
+                "# Get page meta data about the PDF document.",
+                "print(doc._.pdf_file_name)   # 'tests/data/test_pdf_01.pdf'",
+                "print(doc._.page_range)      # (1, 4)",
+                "print(doc._.first_page)      # 1",
+                "print(doc._.last_page)       # 4",
+                "",
+                "# Get all of the text from a specific PDF page.",
+                "print(doc._.page(4))         # 'able to display the destination page (unless...'"
+            ]
+        },
        {
            "id": "nlpcloud",
            "title": "NLPCloud.io",