mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	adding skweak to the spaCy universe
This commit is contained in:
		
							parent
							
								
									1c1087e4ff
								
							
						
					
					
						commit
						bb961a2c11
					
				| 
						 | 
				
			
			@ -1,5 +1,58 @@
 | 
			
		|||
{
 | 
			
		||||
    "resources": [
 | 
			
		||||
        {
 | 
			
		||||
            "id": "skweak",
 | 
			
		||||
            "title": "skweak",
 | 
			
		||||
            "slogan": "Weak supervision for NLP",
 | 
			
		||||
            "description": "`skweak` brings the power of weak supervision to NLP tasks, and in particular sequence labelling and text classification. Instead of annotating documents by hand, `skweak` allows you to define *labelling functions* to automatically label your documents, and then aggregate their results using a statistical model that estimates the accuracy and confusions of each labelling function.",
 | 
			
		||||
            "github": "https://github.com/NorskRegnesentral/skweak",
 | 
			
		||||
            "pip": "skweak",
 | 
			
		||||
            "code_example": [
 | 
			
		||||
                "import spacy, re",
 | 
			
		||||
                "from skweak import heuristics, gazetteers, aggregation, utils",
 | 
			
		||||
                "",
 | 
			
		||||
                "# LF 1: heuristic to detect occurrences of MONEY entities",
 | 
			
		||||
                "def money_detector(doc):",
 | 
			
		||||
                "   for tok in doc[1:]:",
 | 
			
		||||
                "      if tok.text[0].isdigit() and tok.nbor(-1).is_currency:",
 | 
			
		||||
                "          yield tok.i-1, tok.i+1, 'MONEY'",
 | 
			
		||||
                "lf1 = heuristics.FunctionAnnotator('money', money_detector)",
 | 
			
		||||
                "",
 | 
			
		||||
                "# LF 2: detection of years with a regex",
 | 
			
		||||
                "lf2 = heuristics.TokenConstraintAnnotator('years', lambda tok: re.match('(19|20)\\d{2}$', tok.text), 'DATE')",
 | 
			
		||||
                "",
 | 
			
		||||
                "# LF 3: a gazetteer with a few names",
 | 
			
		||||
                "NAMES = [('Barack', 'Obama'), ('Donald', 'Trump'), ('Joe', 'Biden')]",
 | 
			
		||||
                "trie = gazetteers.Trie(NAMES)",
 | 
			
		||||
                "lf3 = gazetteers.GazetteerAnnotator('presidents', {'PERSON':trie})",
 | 
			
		||||
                "",
 | 
			
		||||
                "# We create a corpus (here with a single text)",
 | 
			
		||||
                "nlp = spacy.load('en_core_web_sm')",
 | 
			
		||||
                "doc = nlp('Donald Trump paid $750 in federal income taxes in 2016')",
 | 
			
		||||
                "",
 | 
			
		||||
                "# apply the labelling functions",
 | 
			
		||||
                "doc = lf3(lf2(lf1(doc)))",
 | 
			
		||||
                "",
 | 
			
		||||
                "# and aggregate them",
 | 
			
		||||
                "hmm = aggregation.HMM('hmm', ['PERSON', 'DATE', 'MONEY'])",
 | 
			
		||||
                "hmm.fit_and_aggregate([doc])",
 | 
			
		||||
                "",
 | 
			
		||||
                "# we can then visualise the final result (in Jupyter)",
 | 
			
		||||
                "utils.display_entities(doc, 'hmm')"
 | 
			
		||||
            ],
 | 
			
		||||
            "code_language": "python",
 | 
			
		||||
            "url": "https://github.com/NorskRegnesentral/skweak",
 | 
			
		||||
            "thumb": "https://raw.githubusercontent.com/NorskRegnesentral/skweak/main/data/skweak_logo_thumbnail.jpg",
 | 
			
		||||
            "image": "https://raw.githubusercontent.com/NorskRegnesentral/skweak/main/data/skweak_logo.jpg",
 | 
			
		||||
            "author": "Pierre Lison",
 | 
			
		||||
            "author_links": {
 | 
			
		||||
                "twitter": "plison2",
 | 
			
		||||
                "github": "plison",
 | 
			
		||||
                "website": "https://www.nr.no/~plison"
 | 
			
		||||
            },
 | 
			
		||||
            "category": ["pipeline", "standalone", "research", "training"],
 | 
			
		||||
            "tags": []
 | 
			
		||||
        },
 | 
			
		||||
	{
 | 
			
		||||
	    "id": "numerizer",
 | 
			
		||||
	    "title": "numerizer",
 | 
			
		||||
| 
						 | 
				
			
			@ -3002,4 +3055,4 @@
 | 
			
		|||
            ]
 | 
			
		||||
        }
 | 
			
		||||
    ]
 | 
			
		||||
}
 | 
			
		||||
}
 | 
			
		||||
		Loading…
	
		Reference in New Issue
	
	Block a user