universe: Update examples Adept Augmentations (#12620)

* Update universe.json

* chore: changed readme example as suggested by Vincent Warmerdam (koaning)
David Berenstein 2023-05-15 14:09:33 +02:00 committed by GitHub
parent 3dc445df8d
commit 83b6f488cb

@@ -2865,33 +2865,35 @@
"title": "Adept Augmentations",
"slogan": " A Python library aimed at dissecting and augmenting NER training data for a few-shot scenario.",
"description": "EntitySwapAugmenter takes either a `datasets.Dataset` or a `spacy.tokens.DocBin`. Additionally, it is optional to provide a set of labels. It initially creates a knowledge base of entities belonging to a certain label. When running `augmenter.augment()` for N runs, it then creates N new sentences with random swaps of the original entities with an entity of the same corresponding label from the knowledge base.\n\nFor example, assuming that we have knowledge base for `PERSONS`, `LOCATIONS` and `PRODUCTS`. We can then create additional data for the sentence \"Momofuko Ando created instant noodles in Osaka.\" using `augmenter.augment(N=2)`, resulting in \"David created instant noodles in Madrid.\" or \"Tom created Adept Augmentations in the Netherlands\".",
"github": "davidberenstein1957/adept-augmentations",
"github": "argilla-io/adept-augmentations",
"pip": "adept-augmentations",
"thumb": "https://raw.githubusercontent.com/Pandora-Intelligence/crosslingual-coreference/master/img/logo.png",
"thumb": "https://raw.githubusercontent.com/argilla-io/adept-augmentations/main/logo.png",
"code_example": [
"import spacy",
"from spacy.tokens import DocBin",
"",
"from adept_augmentations import EntitySwapAugmenter",
"import spacy",
"from spacy.tokens import Doc, DocBin",
"nlp = spacy.blank(\"en\")",
"",
"nlp = spacy.load(\"en_core_web_sm\")",
"",
"TRAIN_DATA = [",
" \"Apple is looking at buying U.K. startup for $1 billion\",",
" \"Microsoft acquires GitHub for $7.5 billion\"",
"# Create some example golden data",
"example_data = [",
" (\"Apple is looking at buying U.K. startup for $1 billion\", [(0, 5, \"ORG\"), (27, 31, \"LOC\"), (44, 54, \"MONEY\")]),",
" (\"Microsoft acquires GitHub for $7.5 billion\", [(0, 9, \"ORG\"), (19, 25, \"ORG\"), (30, 42, \"MONEY\")]),",
"]",
"docs = nlp.pipe(TRAIN_DATA)",
"",
"# Create a new DocBin",
"doc_bin = DocBin(docs=docs)",
"nlp = spacy.blank(\"en\")",
"docs = []",
"for entry in example_data:",
" doc = Doc(nlp.vocab, words=entry[0].split())",
" doc.ents = [doc.char_span(ent[0], ent[1], label=ent[2]) for ent in entry[1]]",
" docs.append(doc)",
"golden_dataset = DocBin(docs=docs)",
"",
"# Augment Data",
"doc_bin = EntitySwapAugmenter(doc_bin).augment(4)",
"for doc in doc_bin.get_docs(nlp.vocab):",
"augmented_dataset = EntitySwapAugmenter(golden_dataset).augment(4)",
"for doc in augmented_dataset.get_docs(nlp.vocab):",
" print(doc.text)",
"",
"# Output",
"#",
"# GitHub is looking at buying U.K. startup for $ 7.5 billion",
"# Microsoft is looking at buying U.K. startup for $ 1 billion",
"# Microsoft is looking at buying U.K. startup for $ 7.5 billion",