From 83b6f488cb692e83e4f40d0d09180b3af85da75d Mon Sep 17 00:00:00 2001 From: David Berenstein Date: Mon, 15 May 2023 14:09:33 +0200 Subject: [PATCH] universe: Update examples Adept Augmentations (#12620) * Update universe.json * chore: changed readme example as suggested by Vincent Warmerdam (koaning) --- website/meta/universe.json | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 33185ca30..d4da65c00 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -2865,33 +2865,35 @@ "title": "Adept Augmentations", "slogan": " A Python library aimed at dissecting and augmenting NER training data for a few-shot scenario.", "description": "EntitySwapAugmenter takes either a `datasets.Dataset` or a `spacy.tokens.DocBin`. Additionally, it is optional to provide a set of labels. It initially creates a knowledge base of entities belonging to a certain label. When running `augmenter.augment()` for N runs, it then creates N new sentences with random swaps of the original entities with an entity of the same corresponding label from the knowledge base.\n\nFor example, assuming that we have knowledge base for `PERSONS`, `LOCATIONS` and `PRODUCTS`. 
We can then create additional data for the sentence \"Momofuko Ando created instant noodles in Osaka.\" using `augmenter.augment(N=2)`, resulting in \"David created instant noodles in Madrid.\" or \"Tom created Adept Augmentations in the Netherlands\".", - "github": "davidberenstein1957/adept-augmentations", + "github": "argilla-io/adept-augmentations", "pip": "adept-augmentations", - "thumb": "https://raw.githubusercontent.com/Pandora-Intelligence/crosslingual-coreference/master/img/logo.png", + "thumb": "https://raw.githubusercontent.com/argilla-io/adept-augmentations/main/logo.png", "code_example": [ - "import spacy", - "from spacy.tokens import DocBin", - "", "from adept_augmentations import EntitySwapAugmenter", + "import spacy", + "from spacy.tokens import Doc, DocBin", + "nlp = spacy.blank(\"en\")", "", - "nlp = spacy.load(\"en_core_web_sm\")", - "", - "TRAIN_DATA = [", - " \"Apple is looking at buying U.K. startup for $1 billion\",", - " \"Microsoft acquires GitHub for $7.5 billion\"", + "# Create some example golden data", + "example_data = [", + " (\"Apple is looking at buying U.K. 
startup for $1 billion\", [(0, 5, \"ORG\"), (27, 31, \"LOC\"), (44, 54, \"MONEY\")]),", + " (\"Microsoft acquires GitHub for $7.5 billion\", [(0, 9, \"ORG\"), (19, 25, \"ORG\"), (30, 42, \"MONEY\")]),", "]", - "docs = nlp.pipe(TRAIN_DATA)", "", "# Create a new DocBin", - "doc_bin = DocBin(docs=docs)", + "nlp = spacy.blank(\"en\")", + "docs = []", + "for entry in example_data:", + " doc = Doc(nlp.vocab, words=entry[0].split())", + " doc.ents = [doc.char_span(ent[0], ent[1], label=ent[2]) for ent in entry[1]]", + " docs.append(doc)", + "golden_dataset = DocBin(docs=docs)", "", "# Augment Data", - "doc_bin = EntitySwapAugmenter(doc_bin).augment(4)", - "for doc in doc_bin.get_docs(nlp.vocab):", + "augmented_dataset = EntitySwapAugmenter(golden_dataset).augment(4)", + "for doc in augmented_dataset.get_docs(nlp.vocab):", " print(doc.text)", "", - "# Output", - "#", "# GitHub is looking at buying U.K. startup for $ 7.5 billion", "# Microsoft is looking at buying U.K. startup for $ 1 billion", "# Microsoft is looking at buying U.K. startup for $ 7.5 billion",