mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-01 04:46:38 +03:00
adding skweak to the SpaCy universe
This commit is contained in:
parent
1c1087e4ff
commit
bb961a2c11
|
@ -1,5 +1,58 @@
|
||||||
{
|
{
|
||||||
"resources": [
|
"resources": [
|
||||||
|
{
|
||||||
|
"id": "skweak",
|
||||||
|
"title": "skweak",
|
||||||
|
"slogan": "Weak supervision for NLP",
|
||||||
|
"description": "`skweak` brings the power of weak supervision to NLP tasks, and in particular sequence labelling and text classification. Instead of annotating documents by hand, `skweak` allows you to define *labelling functions* to automatically label your documents, and then aggregate their results using a statistical model that estimates the accuracy and confusions of each labelling function.",
|
||||||
|
"github": "https://github.com/NorskRegnesentral/skweak",
|
||||||
|
"pip": "skweak",
|
||||||
|
"code_example": [
|
||||||
|
"import spacy, re",
|
||||||
|
"from skweak import heuristics, gazetteers, aggregation, utils",
|
||||||
|
"",
|
||||||
|
"# LF 1: heuristic to detect occurrences of MONEY entities",
|
||||||
|
"def money_detector(doc):",
|
||||||
|
" for tok in doc[1:]:",
|
||||||
|
" if tok.text[0].isdigit() and tok.nbor(-1).is_currency:",
|
||||||
|
" yield tok.i-1, tok.i+1, 'MONEY'",
|
||||||
|
"lf1 = heuristics.FunctionAnnotator('money', money_detector)",
|
||||||
|
"",
|
||||||
|
"# LF 2: detection of years with a regex",
|
||||||
|
"lf2 = heuristics.TokenConstraintAnnotator('years', lambda tok: re.match('(19|20)\\d{2}$', tok.text), 'DATE')",
|
||||||
|
"",
|
||||||
|
"# LF 3: a gazetteer with a few names",
|
||||||
|
"NAMES = [('Barack', 'Obama'), ('Donald', 'Trump'), ('Joe', 'Biden')]",
|
||||||
|
"trie = gazetteers.Trie(NAMES)",
|
||||||
|
"lf3 = gazetteers.GazetteerAnnotator('presidents', {'PERSON':trie})",
|
||||||
|
"",
|
||||||
|
"# We create a corpus (here with a single text)",
|
||||||
|
"nlp = spacy.load('en_core_web_sm')",
|
||||||
|
"doc = nlp('Donald Trump paid $750 in federal income taxes in 2016')",
|
||||||
|
"",
|
||||||
|
"# apply the labelling functions",
|
||||||
|
"doc = lf3(lf2(lf1(doc)))",
|
||||||
|
"",
|
||||||
|
"# and aggregate them",
|
||||||
|
"hmm = aggregation.HMM('hmm', ['PERSON', 'DATE', 'MONEY'])",
|
||||||
|
"hmm.fit_and_aggregate([doc])",
|
||||||
|
"",
|
||||||
|
"# we can then visualise the final result (in Jupyter)",
|
||||||
|
"utils.display_entities(doc, 'hmm')"
|
||||||
|
],
|
||||||
|
"code_language": "python",
|
||||||
|
"url": "https://github.com/NorskRegnesentral/skweak",
|
||||||
|
"thumb": "https://raw.githubusercontent.com/NorskRegnesentral/skweak/main/data/skweak_logo_thumbnail.jpg",
|
||||||
|
"image": "https://raw.githubusercontent.com/NorskRegnesentral/skweak/main/data/skweak_logo.jpg",
|
||||||
|
"author": "Pierre Lison",
|
||||||
|
"author_links": {
|
||||||
|
"twitter": "plison2",
|
||||||
|
"github": "plison",
|
||||||
|
"website": "https://www.nr.no/~plison"
|
||||||
|
},
|
||||||
|
"category": ["pipeline", "standalone", "research", "training"],
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"id": "numerizer",
|
"id": "numerizer",
|
||||||
"title": "numerizer",
|
"title": "numerizer",
|
||||||
|
@ -3002,4 +3055,4 @@
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
Loading…
Reference in New Issue
Block a user