spaCy (https://github.com/explosion/spaCy.git)

commit fdaf24604a
parent 8d20617dfd

* Basic punct tests updated and passing

@@ -1,14 +0,0 @@
from .lexeme import lex_of
from .lexeme import length_of

from .tokens import Tokens

# Don't know how to get the enum Python visible :(

LEX = 0
NORM = 1
SHAPE = 2
LAST3 = 3
LENGTH = 4

__all__ = [Tokens, lex_of, length_of, LEX, NORM, SHAPE, LAST3, LENGTH]
							
								
								
									
spacy/en.pxd (37 changed lines)

@@ -3,42 +3,5 @@ from spacy.word cimport Lexeme
cimport cython


cpdef size_t ALPHA
cpdef size_t DIGIT
cpdef size_t PUNCT
cpdef size_t SPACE
cpdef size_t LOWER
cpdef size_t UPPER
cpdef size_t TITLE
cpdef size_t ASCII

cpdef size_t OFT_LOWER
cpdef size_t OFT_TITLE
cpdef size_t OFT_UPPER

cpdef size_t PUNCT
cpdef size_t CONJ
cpdef size_t NUM
cpdef size_t N
cpdef size_t DET
cpdef size_t ADP
cpdef size_t ADJ
cpdef size_t ADV
cpdef size_t VERB
cpdef size_t NOUN
cpdef size_t PDT
cpdef size_t POS
cpdef size_t PRON
cpdef size_t PRT

cpdef size_t SIC
cpdef size_t CANON_CASED
cpdef size_t SHAPE
cpdef size_t NON_SPARSE


cdef class English(Language):
    cpdef int _split_one(self, unicode word)


cpdef English EN

@@ -84,10 +84,10 @@ EN = English('en')


# Thresholds for frequency related flags
TAG_THRESH = 0.5
LOWER_THRESH = 0.5
UPPER_THRESH = 0.3
TITLE_THRESH = 0.9
cdef double TAG_THRESH = 0.5
cdef double LOWER_THRESH = 0.5
cdef double UPPER_THRESH = 0.3
cdef double TITLE_THRESH = 0.9


# Python-readable flag constants --- can't read an enum from Python
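
Side note, not part of the commit: the comment above explains why these flags are plain module-level integers rather than an enum, since a C-level enum is invisible from Python, each flag is published as a bit index. A minimal pure-Python sketch of that bit-flag pattern, with invented names:

IS_ALPHA = 0   # bit positions exposed as plain ints, since Python cannot read a C enum
IS_DIGIT = 1
IS_PUNCT = 2

def set_flag(flags, flag_id):
    # Switch on the bit for flag_id and return the updated field.
    return flags | (1 << flag_id)

def check_flag(flags, flag_id):
    # Test whether the bit for flag_id is set.
    return bool(flags & (1 << flag_id))

flags = set_flag(0, IS_ALPHA)
assert check_flag(flags, IS_ALPHA) and not check_flag(flags, IS_DIGIT)
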

@@ -4,6 +4,10 @@ from spacy.word cimport Lexeme


cdef class Lexicon:
    cdef public dict probs
    cdef public dict clusters
    cdef public dict case_stats
    cdef public dict tag_stats
    cdef public list flag_checkers
    cdef public list string_transformers


@@ -20,7 +20,7 @@ cdef class Language:
        self.name = name
        self.cache = {}
        self.lexicon = Lexicon()
        self.load_tokenization(util.read_tokenization(name))
        #self.load_special_tokenization(util.read_tokenization(name))

    cpdef list tokenize(self, unicode string):
        """Tokenize a string.
@@ -57,7 +57,7 @@ cdef class Language:
        cdef list lexemes = []
        substrings = self._split(string)
        for i, substring in enumerate(substrings):
            lexemes.append(self.lookup(substring))
            lexemes.append(self.lexicon.lookup(substring))
        self.cache[string] = lexemes
        return lexemes

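
Side note, not part of the commit: the change above routes lexeme creation through the lexicon while the language object keeps a per-string cache of the resulting token lists. A rough pure-Python sketch of that cache-then-lookup pattern; ToyLexicon and the trivial _split are invented stand-ins for the real Cython classes:

class ToyLexicon:
    def __init__(self):
        self.lexicon = {}   # string -> lexeme (here just the string itself)

    def lookup(self, string):
        # Create-on-demand, mirroring the create-if-missing behaviour implied above.
        if string not in self.lexicon:
            self.lexicon[string] = string
        return self.lexicon[string]

class ToyLanguage:
    def __init__(self):
        self.cache = {}              # raw substring -> list of lexemes
        self.lexicon = ToyLexicon()

    def _split(self, string):
        return [string]              # the real splitter peels off punctuation

    def tokenize_chunk(self, string):
        if string in self.cache:
            return self.cache[string]
        lexemes = [self.lexicon.lookup(sub) for sub in self._split(string)]
        self.cache[string] = lexemes
        return lexemes
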
@@ -108,7 +108,11 @@ cdef class Language:
cdef class Lexicon:
    def __cinit__(self):
        self.flag_checkers = []
        self.string_transforms = []
        self.string_transformers = []
        self.probs = {}
        self.clusters = {}
        self.case_stats = {}
        self.tag_stats = {}
        self.lexicon = {}

    cpdef Lexeme lookup(self, unicode string):
@@ -151,6 +155,7 @@ cdef class Lexicon:
    def load_probs(self, location):
        """Load unigram probabilities.
        """
        # Dict mapping words to floats
        self.probs = json.load(location)

        cdef Lexeme word
@@ -161,18 +166,21 @@ cdef class Lexicon:
            word.prob = prob

    def load_clusters(self, location):
        self.probs = json.load(location)
        # TODO: Find out endianness
        # Dict mapping words to ??-endian ints
        self.clusters = json.load(location)

        cdef Lexeme word
        cdef unicode string

        for string, word in self.lexicon.items():
            cluster = _pop_default(self.cluster, string, 0)
            cluster = _pop_default(self.clusters, string, 0)
            word.cluster = cluster

    def load_stats(self, location):
        """Load distributional stats.
        """
        # Dict mapping string to dict of arbitrary stuff.
        raise NotImplementedError

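
Side note, not part of the commit: _pop_default is not shown in this diff; from its use above it presumably behaves like dict.pop with a fallback, so words missing from the loaded table still get a value. A guess at the helper and the surrounding loop, with toy data:

def _pop_default(table, key, default):
    # Assumed behaviour: remove and return table[key], or default when the key is absent.
    return table.pop(key, default)

clusters = {"the": 0b1110, "pineapple": 0b0101}   # toy word -> Brown-cluster table
for word in ("the", "dapple"):
    cluster = _pop_default(clusters, word, 0)     # unseen words fall back to cluster 0
    print(word, cluster)
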

@@ -12,7 +12,7 @@ cdef class Lexeme:
    cpdef readonly double prob
    cpdef readonly size_t cluster

    cdef utf8_t* views
    cdef list views
    cdef size_t nr_views

    cdef readonly flag_t flags

@@ -49,35 +49,41 @@ cdef class Lexeme:
            while "dapple" is totally different. On the other hand, "scalable" receives
            the same cluster ID as "pineapple", which is not what we'd like.
    """
    def __cinit__(self, utf8_t string, size_t length, list views, prob=0.0,
                  flags=0):
        self.id = <id_t>&string
        self.length = length
        self.nr_strings = 0
        self.add_views(views)
    def __cinit__(self, unicode string, prob, cluster, case_stats,
                  tag_stats, flag_checkers, string_transformers):
        self.prob = prob
        self.cluster = cluster
        self.length = len(string)
        self.id = hash(string)

        self.nr_views = len(string_transformers)
        self.views = []
        cdef unicode view
        for i, string_transformer in enumerate(string_transformers):
            view = string_transformer(string, prob, case_stats, tag_stats)
            self.views.append(view)

        for i, flag_checker in enumerate(flag_checkers):
            if flag_checker(string, prob, case_stats, tag_stats):
                self.set_flag(i)

    def __dealloc__(self):
        free(self.views)
        pass

    property string:
        def __get__(self):
            return self.strings[0].decode('utf8')
            return self.views[0]

    cpdef unicode get_view_string(self, size_t i):
        assert i < self.nr_strings
        return self.strings[i].decode('utf8')
        assert i < self.nr_views
        return self.views[i]

    cpdef id_t get_view_id(self, size_t i) except 0:
        assert i < self.nr_strings
        return <id_t>&self.views[i]
        return <id_t>hash(self.views[i])

    cpdef int add_view(self, unicode view) except -1:
        self.nr_views += 1
        self.views = <char**>realloc(self.views, self.nr_views * sizeof(utf8_t))
        cdef bytes utf8_string = view.encode('utf8')
        # Intern strings, allowing pointer comparison
        utf8_string = intern(utf8_string)
        self.views[self.nr_views - 1] = utf8_string
        self.views.append(view)

    cpdef bint check_flag(self, size_t flag_id) except *:
        """Access the value of one of the pre-computed boolean distribution features.
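
Side note, not part of the commit: the rewritten constructor derives everything from plug-in callables, where each string_transformer yields one string view and each flag_checker decides one bit of the flags field. A minimal pure-Python sketch of that protocol; the example transformer and checker are invented, not the library's own:

def lower_view(string, prob, case_stats, tag_stats):
    return string.lower()

def shape_view(string, prob, case_stats, tag_stats):
    # Crude word shape: upper-case letters -> X, digits -> d, everything else -> x.
    return "".join("d" if c.isdigit() else "X" if c.isupper() else "x" for c in string)

def is_alpha(string, prob, case_stats, tag_stats):
    return string.isalpha()

class ToyLexeme:
    def __init__(self, string, prob, cluster, case_stats, tag_stats,
                 flag_checkers, string_transformers):
        self.views = [t(string, prob, case_stats, tag_stats) for t in string_transformers]
        self.flags = 0
        for i, check in enumerate(flag_checkers):
            if check(string, prob, case_stats, tag_stats):
                self.flags |= 1 << i

word = ToyLexeme("Hello", 0.0, 0, {}, {}, [is_alpha], [lower_view, shape_view])
assert word.views == ["hello", "Xxxxx"]
assert word.flags & 1
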

@@ -1,8 +1,6 @@
from __future__ import unicode_literals

from spacy.en import lookup
from spacy.en import tokenize
from spacy.en import unhash
from spacy.en import EN

import pytest

@@ -16,28 +14,28 @@ def test_close(close_puncts):
    word_str = 'Hello'
    for p in close_puncts:
        string = word_str + p
        tokens = tokenize(string)
        tokens = EN.tokenize(string)
        assert len(tokens) == 2
        assert unhash(tokens[1].lex) == p
        assert unhash(tokens[0].lex) == word_str
        assert tokens[1].string == p
        assert tokens[0].string == word_str


def test_two_different_close(close_puncts):
    word_str = 'Hello'
    for p in close_puncts:
        string = word_str + p + "'"
        tokens = tokenize(string)
        tokens = EN.tokenize(string)
        assert len(tokens) == 3
        assert unhash(tokens[0].lex) == word_str
        assert unhash(tokens[1].lex) == p
        assert unhash(tokens[2].lex) == "'"
        assert tokens[0].string == word_str
        assert tokens[1].string == p
        assert tokens[2].string == "'"


def test_three_same_close(close_puncts):
    word_str = 'Hello'
    for p in close_puncts:
        string = word_str + p + p + p
        tokens = tokenize(string)
        tokens = EN.tokenize(string)
        assert len(tokens) == 4
        assert unhash(tokens[0].lex) == word_str
        assert unhash(tokens[1].lex) == p
        assert tokens[0].string == word_str
        assert tokens[1].string == p

@@ -1,8 +1,6 @@
from __future__ import unicode_literals

from spacy.en import lookup
from spacy.en import tokenize
from spacy.en import unhash
from spacy.en import EN

import pytest

@@ -16,35 +14,35 @@ def test_open(open_puncts):
    word_str = 'Hello'
    for p in open_puncts:
        string = p + word_str
        tokens = tokenize(string)
        tokens = EN.tokenize(string)
        assert len(tokens) == 2
        assert unhash(tokens[0].lex) == p
        assert unhash(tokens[1].lex) == word_str
        assert tokens[0].string == p
        assert tokens[1].string == word_str


def test_two_different_open(open_puncts):
    word_str = 'Hello'
    for p in open_puncts:
        string = p + "`" + word_str
        tokens = tokenize(string)
        tokens = EN.tokenize(string)
        assert len(tokens) == 3
        assert unhash(tokens[0].lex) == p
        assert unhash(tokens[1].lex) == "`"
        assert unhash(tokens[2].lex) == word_str
        assert tokens[0].string == p
        assert tokens[1].string == "`"
        assert tokens[2].string == word_str


def test_three_same_open(open_puncts):
    word_str = 'Hello'
    for p in open_puncts:
        string = p + p + p + word_str
        tokens = tokenize(string)
        tokens = EN.tokenize(string)
        assert len(tokens) == 4
        assert unhash(tokens[0].lex) == p
        assert unhash(tokens[3].lex) == word_str
        assert tokens[0].string == p
        assert tokens[3].string == word_str


def test_open_appostrophe():
    string = "'The"
    tokens = tokenize(string)
    tokens = EN.tokenize(string)
    assert len(tokens) == 2
    assert unhash(tokens[0].lex) == "'"
    assert tokens[0].string == "'"

@@ -1,8 +1,6 @@
from __future__ import unicode_literals

from spacy.en import tokenize
from spacy.en import lookup
from spacy.en import unhash
from spacy.en import EN

import pytest

@@ -16,22 +14,22 @@ def test_token(paired_puncts):
    word_str = 'Hello'
    for open_, close_ in paired_puncts:
        string = open_ + word_str + close_
        tokens = tokenize(string)
        tokens = EN.tokenize(string)
        assert len(tokens) == 3
        assert unhash(tokens[0].lex) == open_
        assert unhash(tokens[1].lex) == word_str
        assert unhash(tokens[2].lex) == close_
        assert tokens[0].string == open_
        assert tokens[1].string == word_str
        assert tokens[2].string == close_


def test_two_different(paired_puncts):
    word_str = 'Hello'
    for open_, close_ in paired_puncts:
        string = "`" + open_ + word_str + close_ + "'"
        tokens = tokenize(string)
        tokens = EN.tokenize(string)
        assert len(tokens) == 5
        assert unhash(tokens[0].lex) == "`"
        assert unhash(tokens[1].lex) == open_
        assert unhash(tokens[2].lex) == word_str
        assert unhash(tokens[2].lex) == word_str
        assert unhash(tokens[3].lex) == close_
        assert unhash(tokens[4].lex) == "'"
        assert tokens[0].string == "`"
        assert tokens[1].string == open_
        assert tokens[2].string == word_str
        assert tokens[2].string == word_str
        assert tokens[3].string == close_
        assert tokens[4].string == "'"
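
Side note, not part of the commit: the updated tests go through the module-level EN instance and compare token.string directly instead of unhashing lex ids. Assuming the module layout in this commit, the same check can be reproduced interactively (the exact punctuation pairs come from the fixtures, not shown here):

from spacy.en import EN

tokens = EN.tokenize("(Hello)")
assert len(tokens) == 3
assert [t.string for t in tokens] == ["(", "Hello", ")"]
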