adding skweak to the spaCy universe

This commit is contained in:
Pierre Lison 2021-04-22 00:58:09 +02:00 committed by Ines Montani
parent 1c1087e4ff
commit bb961a2c11

View File

@ -1,5 +1,58 @@
{
"resources": [
{
"id": "skweak",
"title": "skweak",
"slogan": "Weak supervision for NLP",
"description": "`skweak` brings the power of weak supervision to NLP tasks, and in particular sequence labelling and text classification. Instead of annotating documents by hand, `skweak` allows you to define *labelling functions* to automatically label your documents, and then aggregate their results using a statistical model that estimates the accuracy and confusions of each labelling function.",
"github": "https://github.com/NorskRegnesentral/skweak",
"pip": "skweak",
"code_example": [
"import spacy, re",
"from skweak import heuristics, gazetteers, aggregation, utils",
"",
"# LF 1: heuristic to detect occurrences of MONEY entities",
"def money_detector(doc):",
" for tok in doc[1:]:",
" if tok.text[0].isdigit() and tok.nbor(-1).is_currency:",
" yield tok.i-1, tok.i+1, 'MONEY'",
"lf1 = heuristics.FunctionAnnotator('money', money_detector)",
"",
"# LF 2: detection of years with a regex",
"lf2= heuristics.TokenConstraintAnnotator ('years', lambda tok: re.match('(19|20)\\d{2}$', tok.text), 'DATE')",
"",
"# LF 3: a gazetteer with a few names",
"NAMES = [('Barack', 'Obama'), ('Donald', 'Trump'), ('Joe', 'Biden')]",
"trie = gazetteers.Trie(NAMES)",
"lf3 = gazetteers.GazetteerAnnotator('presidents', {'PERSON':trie})",
"",
"# We create a corpus (here with a single text)",
"nlp = spacy.load('en_core_web_sm')",
"doc = nlp('Donald Trump paid $750 in federal income taxes in 2016')",
"",
"# apply the labelling functions",
"doc = lf3(lf2(lf1(doc)))",
"",
"# and aggregate them",
"hmm = aggregation.HMM('hmm', ['PERSON', 'DATE', 'MONEY'])",
"hmm.fit_and_aggregate([doc])",
"",
"# we can then visualise the final result (in Jupyter)",
"utils.display_entities(doc, 'hmm')"
],
"code_language": "python",
"url": "https://github.com/NorskRegnesentral/skweak",
"thumb": "https://raw.githubusercontent.com/NorskRegnesentral/skweak/main/data/skweak_logo_thumbnail.jpg",
"image": "https://raw.githubusercontent.com/NorskRegnesentral/skweak/main/data/skweak_logo.jpg",
"author": "Pierre Lison",
"author_links": {
"twitter": "plison2",
"github": "plison",
"website": "https://www.nr.no/~plison"
},
"category": ["pipeline", "standalone", "research", "training"],
"tags": []
},
{
"id": "numerizer",
"title": "numerizer",
@ -3002,4 +3055,4 @@
]
}
]
}
}