From debfb46088ee7eb53a02f80f0d5fa0ea7a8ee5bf Mon Sep 17 00:00:00 2001
From: Pierre Lison <plison@nr.no>
Date: Thu, 22 Apr 2021 00:58:09 +0200
Subject: [PATCH] adding skweak to the spaCy universe

---
 website/meta/universe.json | 55 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 1 deletion(-)

diff --git a/website/meta/universe.json b/website/meta/universe.json
index 01aa058b5..7b13e9ac2 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -1,5 +1,58 @@
 {
     "resources": [
+        {
+            "id": "skweak",
+            "title": "skweak",
+            "slogan": "Weak supervision for NLP",
+            "description": "`skweak` brings the power of weak supervision to NLP tasks, and in particular sequence labelling and text classification. Instead of annotating documents by hand, `skweak` allows you to define *labelling functions* to automatically label your documents, and then aggregate their results using a statistical model that estimates the accuracy and confusions of each labelling function.",
+            "github": "https://github.com/NorskRegnesentral/skweak",
+            "pip": "skweak",
+            "code_example": [
+                "import spacy, re",
+                "from skweak import heuristics, gazetteers, aggregation, utils",
+                "",
+                "# LF 1: heuristic to detect occurrences of MONEY entities",
+                "def money_detector(doc):",
+                "    for tok in doc[1:]:",
+                "        if tok.text[0].isdigit() and tok.nbor(-1).is_currency:",
+                "            yield tok.i-1, tok.i+1, 'MONEY'",
+                "lf1 = heuristics.FunctionAnnotator('money', money_detector)",
+                "",
+                "# LF 2: detection of years with a regex",
+                "lf2 = heuristics.TokenConstraintAnnotator('years', lambda tok: re.match('(19|20)\\d{2}$', tok.text), 'DATE')",
+                "",
+                "# LF 3: a gazetteer with a few names",
+                "NAMES = [('Barack', 'Obama'), ('Donald', 'Trump'), ('Joe', 'Biden')]",
+                "trie = gazetteers.Trie(NAMES)",
+                "lf3 = gazetteers.GazetteerAnnotator('presidents', {'PERSON':trie})",
+                "",
+                "# We create a corpus (here with a single text)",
+                "nlp = spacy.load('en_core_web_sm')",
+                "doc = nlp('Donald Trump paid $750 in federal income taxes in 2016')",
+                "",
+                "# apply the labelling functions",
+                "doc = lf3(lf2(lf1(doc)))",
+                "",
+                "# and aggregate them",
+                "hmm = aggregation.HMM('hmm', ['PERSON', 'DATE', 'MONEY'])",
+                "hmm.fit_and_aggregate([doc])",
+                "",
+                "# we can then visualise the final result (in Jupyter)",
+                "utils.display_entities(doc, 'hmm')"
+            ],
+            "code_language": "python",
+            "url": "https://github.com/NorskRegnesentral/skweak",
+            "thumb": "https://raw.githubusercontent.com/NorskRegnesentral/skweak/main/data/skweak_logo_thumbnail.jpg",
+            "image": "https://raw.githubusercontent.com/NorskRegnesentral/skweak/main/data/skweak_logo.jpg",
+            "author": "Pierre Lison",
+            "author_links": {
+                "twitter": "plison2",
+                "github": "plison",
+                "website": "https://www.nr.no/~plison"
+            },
+            "category": ["pipeline", "standalone", "research", "training"],
+            "tags": []
+        },
 	{
 	    "id": "numerizer",
 	    "title": "numerizer",
@@ -3002,4 +3055,4 @@
             ]
         }
     ]
-}
+}
\ No newline at end of file