adding skweak to the spaCy universe

2025-11-04 01:48:04 +03:00 · 2021-04-22 00:58:09 +02:00 · 2021-04-22 00:58:09 +02:00 · debfb46088
commit debfb46088
parent 6017fcf693
1 changed files with 54 additions and 1 deletions
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@ -1,5 +1,58 @@
 {
    "resources": [
        {
            "id": "skweak",
            "title": "skweak",
            "slogan": "Weak supervision for NLP",
            "description": "`skweak` brings the power of weak supervision to NLP tasks, and in particular sequence labelling and text classification. Instead of annotating documents by hand, `skweak` allows you to define *labelling functions* to automatically label your documents, and then aggregate their results using a statistical model that estimates the accuracy and confusions of each labelling function.",
            "github": "https://github.com/NorskRegnesentral/skweak",
            "pip": "skweak",
            "code_example": [
                "import spacy, re",
                "from skweak import heuristics, gazetteers, aggregation, utils",
                "",
                "# LF 1: heuristic to detect occurrences of MONEY entities",
                "def money_detector(doc):",
                "    for tok in doc[1:]:",
                "        if tok.text[0].isdigit() and tok.nbor(-1).is_currency:",
                "            yield tok.i-1, tok.i+1, 'MONEY'",
                "lf1 = heuristics.FunctionAnnotator('money', money_detector)",
                "",
                "# LF 2: detection of years with a regex",
                "lf2 = heuristics.TokenConstraintAnnotator('years', lambda tok: re.match(r'(19|20)\\d{2}$', tok.text), 'DATE')",
                "",
                "# LF 3: a gazetteer with a few names",
                "NAMES = [('Barack', 'Obama'), ('Donald', 'Trump'), ('Joe', 'Biden')]",
                "trie = gazetteers.Trie(NAMES)",
                "lf3 = gazetteers.GazetteerAnnotator('presidents', {'PERSON':trie})",
                "",
                "# We create a corpus (here with a single text)",
                "nlp = spacy.load('en_core_web_sm')",
                "doc = nlp('Donald Trump paid $750 in federal income taxes in 2016')",
                "",
                "# apply the labelling functions",
                "doc = lf3(lf2(lf1(doc)))",
                "",
                "# and aggregate them",
                "hmm = aggregation.HMM('hmm', ['PERSON', 'DATE', 'MONEY'])",
                "hmm.fit_and_aggregate([doc])",
                "",
                "# we can then visualise the final result (in Jupyter)",
                "utils.display_entities(doc, 'hmm')"
            ],
            "code_language": "python",
            "url": "https://github.com/NorskRegnesentral/skweak",
            "thumb": "https://raw.githubusercontent.com/NorskRegnesentral/skweak/main/data/skweak_logo_thumbnail.jpg",
            "image": "https://raw.githubusercontent.com/NorskRegnesentral/skweak/main/data/skweak_logo.jpg",
            "author": "Pierre Lison",
            "author_links": {
                "twitter": "plison2",
                "github": "plison",
                "website": "https://www.nr.no/~plison"
            },
            "category": ["pipeline", "standalone", "research", "training"],
            "tags": []
        },
 	{
 	    "id": "numerizer",
 	    "title": "numerizer",