mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	adding skweak to the SpaCy universe
This commit is contained in:
		
							parent
							
								
									6017fcf693
								
							
						
					
					
						commit
						debfb46088
					
				| 
						 | 
					@ -1,5 +1,58 @@
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
    "resources": [
 | 
					    "resources": [
 | 
				
			||||||
 | 
					        {
 | 
				
			||||||
 | 
					            "id": "skweak",
 | 
				
			||||||
 | 
					            "title": "skweak",
 | 
				
			||||||
 | 
					            "slogan": "Weak supervision for NLP",
 | 
				
			||||||
 | 
					            "description": "`skweak` brings the power of weak supervision to NLP tasks, and in particular sequence labelling and text classification. Instead of annotating documents by hand, `skweak` allows you to define *labelling functions* to automatically label your documents, and then aggregate their results using a statistical model that estimates the accuracy and confusions of each labelling function.",
 | 
				
			||||||
 | 
					            "github": "https://github.com/NorskRegnesentral/skweak",
 | 
				
			||||||
 | 
					            "pip": "skweak",
 | 
				
			||||||
 | 
					            "code_example": [
 | 
				
			||||||
 | 
					                "import spacy, re",
 | 
				
			||||||
 | 
					                "from skweak import heuristics, gazetteers, aggregation, utils",
 | 
				
			||||||
 | 
					                "",
 | 
				
			||||||
 | 
					                "# LF 1: heuristic to detect occurrences of MONEY entities",
 | 
				
			||||||
 | 
					                "def money_detector(doc):",
 | 
				
			||||||
 | 
					                "   for tok in doc[1:]:",
 | 
				
			||||||
 | 
					                "      if tok.text[0].isdigit() and tok.nbor(-1).is_currency:",
 | 
				
			||||||
 | 
					                "          yield tok.i-1, tok.i+1, 'MONEY'",
 | 
				
			||||||
 | 
					                "lf1 = heuristics.FunctionAnnotator('money', money_detector)",
 | 
				
			||||||
 | 
					                "",
 | 
				
			||||||
 | 
					                "# LF 2: detection of years with a regex",
 | 
				
			||||||
 | 
					                "lf2= heuristics.TokenConstraintAnnotator ('years', lambda tok: re.match('(19|20)\\d{2}$', tok.text), 'DATE')",
 | 
				
			||||||
 | 
					                "",
 | 
				
			||||||
 | 
					                "# LF 3: a gazetteer with a few names",
 | 
				
			||||||
 | 
					                "NAMES = [('Barack', 'Obama'), ('Donald', 'Trump'), ('Joe', 'Biden')]",
 | 
				
			||||||
 | 
					                "trie = gazetteers.Trie(NAMES)",
 | 
				
			||||||
 | 
					                "lf3 = gazetteers.GazetteerAnnotator('presidents', {'PERSON':trie})",
 | 
				
			||||||
 | 
					                "",
 | 
				
			||||||
 | 
					                "# We create a corpus (here with a single text)",
 | 
				
			||||||
 | 
					                "nlp = spacy.load('en_core_web_sm')",
 | 
				
			||||||
 | 
					                "doc = nlp('Donald Trump paid $750 in federal income taxes in 2016')",
 | 
				
			||||||
 | 
					                "",
 | 
				
			||||||
 | 
					                "# apply the labelling functions",
 | 
				
			||||||
 | 
					                "doc = lf3(lf2(lf1(doc)))",
 | 
				
			||||||
 | 
					                "",
 | 
				
			||||||
 | 
					                "# and aggregate them",
 | 
				
			||||||
 | 
					                "hmm = aggregation.HMM('hmm', ['PERSON', 'DATE', 'MONEY'])",
 | 
				
			||||||
 | 
					                "hmm.fit_and_aggregate([doc])",
 | 
				
			||||||
 | 
					                "",
 | 
				
			||||||
 | 
					                "# we can then visualise the final result (in Jupyter)",
 | 
				
			||||||
 | 
					                "utils.display_entities(doc, 'hmm')"
 | 
				
			||||||
 | 
					            ],
 | 
				
			||||||
 | 
					            "code_language": "python",
 | 
				
			||||||
 | 
					            "url": "https://github.com/NorskRegnesentral/skweak",
 | 
				
			||||||
 | 
					            "thumb": "https://raw.githubusercontent.com/NorskRegnesentral/skweak/main/data/skweak_logo_thumbnail.jpg",
 | 
				
			||||||
 | 
					            "image": "https://raw.githubusercontent.com/NorskRegnesentral/skweak/main/data/skweak_logo.jpg",
 | 
				
			||||||
 | 
					            "author": "Pierre Lison",
 | 
				
			||||||
 | 
					            "author_links": {
 | 
				
			||||||
 | 
					                "twitter": "plison2",
 | 
				
			||||||
 | 
					                "github": "plison",
 | 
				
			||||||
 | 
					                "website": "https://www.nr.no/~plison"
 | 
				
			||||||
 | 
					            },
 | 
				
			||||||
 | 
					            "category": ["pipeline", "standalone", "research", "training"],
 | 
				
			||||||
 | 
					            "tags": []
 | 
				
			||||||
 | 
					        },
 | 
				
			||||||
	{
 | 
						{
 | 
				
			||||||
	    "id": "numerizer",
 | 
						    "id": "numerizer",
 | 
				
			||||||
	    "title": "numerizer",
 | 
						    "title": "numerizer",
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user