From debfb46088ee7eb53a02f80f0d5fa0ea7a8ee5bf Mon Sep 17 00:00:00 2001 From: Pierre Lison Date: Thu, 22 Apr 2021 00:58:09 +0200 Subject: [PATCH] adding skweak to the spaCy universe --- website/meta/universe.json | 55 +++++++++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 01aa058b5..7b13e9ac2 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1,5 +1,58 @@ { "resources": [ + { + "id": "skweak", + "title": "skweak", + "slogan": "Weak supervision for NLP", + "description": "`skweak` brings the power of weak supervision to NLP tasks, and in particular sequence labelling and text classification. Instead of annotating documents by hand, `skweak` allows you to define *labelling functions* to automatically label your documents, and then aggregate their results using a statistical model that estimates the accuracy and confusions of each labelling function.", + "github": "https://github.com/NorskRegnesentral/skweak", + "pip": "skweak", + "code_example": [ + "import spacy, re", + "from skweak import heuristics, gazetteers, aggregation, utils", + "", + "# LF 1: heuristic to detect occurrences of MONEY entities", + "def money_detector(doc):", + "    for tok in doc[1:]:", + "        if tok.text[0].isdigit() and tok.nbor(-1).is_currency:", + "            yield tok.i-1, tok.i+1, 'MONEY'", + "lf1 = heuristics.FunctionAnnotator('money', money_detector)", + "", + "# LF 2: detection of years with a regex", + "lf2 = heuristics.TokenConstraintAnnotator('years', lambda tok: re.match('(19|20)\\d{2}$', tok.text), 'DATE')", + "", + "# LF 3: a gazetteer with a few names", + "NAMES = [('Barack', 'Obama'), ('Donald', 'Trump'), ('Joe', 'Biden')]", + "trie = gazetteers.Trie(NAMES)", + "lf3 = gazetteers.GazetteerAnnotator('presidents', {'PERSON':trie})", + "", + "# We create a corpus (here with a single text)", + "nlp = spacy.load('en_core_web_sm')", + "doc = nlp('Donald Trump paid $750 in federal income taxes in 2016')", + "", + "# apply the labelling functions", + "doc = lf3(lf2(lf1(doc)))", + "", + "# and aggregate them", + "hmm = aggregation.HMM('hmm', ['PERSON', 'DATE', 'MONEY'])", + "hmm.fit_and_aggregate([doc])", + "", + "# we can then visualise the final result (in Jupyter)", + "utils.display_entities(doc, 'hmm')" + ], + "code_language": "python", + "url": "https://github.com/NorskRegnesentral/skweak", + "thumb": "https://raw.githubusercontent.com/NorskRegnesentral/skweak/main/data/skweak_logo_thumbnail.jpg", + "image": "https://raw.githubusercontent.com/NorskRegnesentral/skweak/main/data/skweak_logo.jpg", + "author": "Pierre Lison", + "author_links": { + "twitter": "plison2", + "github": "plison", + "website": "https://www.nr.no/~plison" + }, + "category": ["pipeline", "standalone", "research", "training"], + "tags": [] + }, { "id": "numerizer", "title": "numerizer", @@ -3002,4 +3055,4 @@ ] } ] -} +} \ No newline at end of file