mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-26 05:31:15 +03:00 
			
		
		
		
	* Refactoring with Lexeme as a class now compiles. Basic design seems to work
This commit is contained in:
		
							parent
							
								
									68bae2fec6
								
							
						
					
					
						commit
						e9a62b6eba
					
				
							
								
								
									
										14
									
								
								spacy/en.pxd
									
									
									
									
									
								
							
							
						
						
									
										14
									
								
								spacy/en.pxd
									
									
									
									
									
								
							|  | @ -1,4 +1,4 @@ | ||||||
| from spacy.spacy cimport Language | from spacy.lang cimport Language | ||||||
| from spacy.word cimport Lexeme | from spacy.word cimport Lexeme | ||||||
| cimport cython | cimport cython | ||||||
| 
 | 
 | ||||||
|  | @ -31,12 +31,14 @@ cpdef size_t POS | ||||||
| cpdef size_t PRON | cpdef size_t PRON | ||||||
| cpdef size_t PRT | cpdef size_t PRT | ||||||
| 
 | 
 | ||||||
| cdef class English(spacy.Language): | cpdef size_t SIC | ||||||
|     cdef int find_split(self, unicode word) | cpdef size_t CANON_CASED | ||||||
|  | cpdef size_t SHAPE | ||||||
|  | cpdef size_t NON_SPARSE | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef English EN | cdef class English(Language): | ||||||
|  |     cpdef int _split_one(self, unicode word) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cpdef Word lookup(unicode word) | cpdef English EN | ||||||
| cpdef list tokenize(unicode string) |  | ||||||
|  |  | ||||||
							
								
								
									
										189
									
								
								spacy/en.pyx
									
									
									
									
									
								
							
							
						
						
									
										189
									
								
								spacy/en.pyx
									
									
									
									
									
								
							|  | @ -31,6 +31,7 @@ same scheme. Tokenization problems are a major cause of poor performance for | ||||||
| NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module | NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module | ||||||
| provides a fully Penn Treebank 3-compliant tokenizer. | provides a fully Penn Treebank 3-compliant tokenizer. | ||||||
| ''' | ''' | ||||||
|  | # TODO | ||||||
| #The script translate_treebank_tokenization can be used to transform a treebank's | #The script translate_treebank_tokenization can be used to transform a treebank's | ||||||
| #annotation to use one of the spacy tokenization schemes. | #annotation to use one of the spacy tokenization schemes. | ||||||
| 
 | 
 | ||||||
|  | @ -40,90 +41,14 @@ from __future__ import unicode_literals | ||||||
| from libc.stdlib cimport malloc, calloc, free | from libc.stdlib cimport malloc, calloc, free | ||||||
| from libc.stdint cimport uint64_t | from libc.stdint cimport uint64_t | ||||||
| 
 | 
 | ||||||
| cimport spacy | cimport lang | ||||||
| 
 | 
 | ||||||
| 
 | from spacy import orth | ||||||
| # Python-readable flag constants --- can't read an enum from Python |  | ||||||
| 
 |  | ||||||
| # Don't want to manually assign these numbers, or we'll insert one and have to |  | ||||||
| # change them all. |  | ||||||
| # Don't use "i", as we don't want it in the global scope! |  | ||||||
| cdef size_t __i = 0 |  | ||||||
| 
 |  | ||||||
| ALPHA = __i; i += 1 |  | ||||||
| DIGIT = __i; __i += 1 |  | ||||||
| PUNCT = __i; __i += 1 |  | ||||||
| SPACE = __i; __i += 1 |  | ||||||
| LOWER = __i; __i += 1 |  | ||||||
| UPPER = __i; __i += 1 |  | ||||||
| TITLE = __i; __i += 1 |  | ||||||
| ASCII = __i; __i += 1 |  | ||||||
| 
 |  | ||||||
| OFT_LOWER = __i; __i += 1  |  | ||||||
| OFT_UPPER = __i; __i += 1 |  | ||||||
| OFT_TITLE = __i; __i += 1 |  | ||||||
| 
 |  | ||||||
| PUNCT = __i; __i += 1 |  | ||||||
| CONJ = __i; __i += 1 |  | ||||||
| NUM = __i; __i += 1 |  | ||||||
| X = __i; __i += 1 |  | ||||||
| DET = __i; __i += 1 |  | ||||||
| ADP = __i; __i += 1 |  | ||||||
| ADJ = __i; __i += 1 |  | ||||||
| ADV = __i; __i += 1 |  | ||||||
| VERB = __i; __i += 1 |  | ||||||
| NOUN = __i; __i += 1 |  | ||||||
| PDT = __i; __i += 1 |  | ||||||
| POS = __i; __i += 1 |  | ||||||
| PRON = __i; __i += 1 |  | ||||||
| PRT = __i; __i += 1 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # These are for the string views |  | ||||||
| __i = 0 |  | ||||||
| SIC = __i; __i += 1 |  | ||||||
| CANON_CASED = __i; __i += 1 |  | ||||||
| NON_SPARSE = __i; __i += 1 |  | ||||||
| SHAPE = __i; __i += 1 |  | ||||||
| NR_STRING_VIEWS = __i |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def get_string_views(unicode string, lexeme): |  | ||||||
|     views = ['' for _ in range(NR_STRING_VIEWS)] |  | ||||||
|     views[SIC] = string |  | ||||||
|     views[CANON_CASED] = canonicalize_case(string, lexeme) |  | ||||||
|     views[SHAPE] = get_string_shape(string) |  | ||||||
|     views[NON_SPARSE] = get_non_sparse(string, views[CANON_CASED], views[SHAPE], |  | ||||||
|                                        lexeme) |  | ||||||
|     return views |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def set_orth_flags(unicode string, flags_t flags) |  | ||||||
|     setters = [ |  | ||||||
|         (ALPHA, is_alpha), |  | ||||||
|         (DIGIT, is_digit), |  | ||||||
|         (PUNCT, is_punct), |  | ||||||
|         (SPACE, is_space), |  | ||||||
|         (LOWER, is_lower), |  | ||||||
|         (UPPER, is_upper), |  | ||||||
|         (SPACE, is_space) |  | ||||||
|     ] |  | ||||||
| 
 |  | ||||||
|     for bit, setter in setters: |  | ||||||
|         if setter(string): |  | ||||||
|             flags |= 1 << bit |  | ||||||
|     return flags |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| 
 | cdef class English(Language): | ||||||
| cdef class English(spacy.Language): |     cpdef int _split_one(self, unicode word): | ||||||
|     cdef Lexeme new_lexeme(self, unicode string, cluster=0, prob=0, case_stats=None, |  | ||||||
|                            tag_freqs=None): |  | ||||||
|         return Lexeme(s, length, views, prob=prob, cluster=cluster, |  | ||||||
|                       flags=self.get_flags(string)) |  | ||||||
| 
 |  | ||||||
|     cdef int find_split(self, unicode word): |  | ||||||
|         cdef size_t length = len(word) |         cdef size_t length = len(word) | ||||||
|         cdef int i = 0 |         cdef int i = 0 | ||||||
|         if word.startswith("'s") or word.startswith("'S"): |         if word.startswith("'s") or word.startswith("'S"): | ||||||
|  | @ -132,17 +57,16 @@ cdef class English(spacy.Language): | ||||||
|         if word.endswith("'s") and length >= 3: |         if word.endswith("'s") and length >= 3: | ||||||
|             return length - 2 |             return length - 2 | ||||||
|         # Leading punctuation |         # Leading punctuation | ||||||
|         if check_punct(word, 0, length): |         if _check_punct(word, 0, length): | ||||||
|             return 1 |             return 1 | ||||||
|         elif length >= 1: |         elif length >= 1: | ||||||
|             # Split off all trailing punctuation characters |             # Split off all trailing punctuation characters | ||||||
|             i = 0 |             i = 0 | ||||||
|             while i < length and not check_punct(word, i, length): |             while i < length and not _check_punct(word, i, length): | ||||||
|                 i += 1 |                 i += 1 | ||||||
|         return i |         return i | ||||||
| 
 | 
 | ||||||
| 
 | cdef bint _check_punct(unicode word, size_t i, size_t length): | ||||||
| cdef bint check_punct(unicode word, size_t i, size_t length): |  | ||||||
|     # Don't count appostrophes as punct if the next char is a letter |     # Don't count appostrophes as punct if the next char is a letter | ||||||
|     if word[i] == "'" and i < (length - 1) and word[i+1].isalpha(): |     if word[i] == "'" and i < (length - 1) and word[i+1].isalpha(): | ||||||
|         return i == 0 |         return i == 0 | ||||||
|  | @ -160,69 +84,46 @@ cdef bint check_punct(unicode word, size_t i, size_t length): | ||||||
| EN = English('en') | EN = English('en') | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cpdef list tokenize(unicode string): | # Thresholds for frequency related flags | ||||||
|     """Tokenize a string. | TAG_THRESH = 0.5 | ||||||
| 
 | LOWER_THRESH = 0.5 | ||||||
|     The tokenization rules are defined in two places: | UPPER_THRESH = 0.3 | ||||||
| 
 | TITLE_THRESH = 0.9 | ||||||
|     * The data/en/tokenization table, which handles special cases like contractions; |  | ||||||
|     * The :py:meth:`spacy.en.English.find_split` function, which is used to split off punctuation etc. |  | ||||||
| 
 |  | ||||||
|     Args: |  | ||||||
|         string (unicode): The string to be tokenized.  |  | ||||||
| 
 |  | ||||||
|     Returns: |  | ||||||
|         tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs. |  | ||||||
|     """ |  | ||||||
|     return EN.tokenize(string) |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cpdef Lexeme lookup(unicode string): | # Python-readable flag constants --- can't read an enum from Python | ||||||
|     """Retrieve (or create, if not found) a Lexeme for a string, and return its ID. | ALPHA = EN.lexicon.add_flag(orth.is_alpha) | ||||||
|  | DIGIT = EN.lexicon.add_flag(orth.is_digit) | ||||||
|  | PUNCT = EN.lexicon.add_flag(orth.is_punct) | ||||||
|  | SPACE = EN.lexicon.add_flag(orth.is_space) | ||||||
|  | PUNCT = EN.lexicon.add_flag(orth.is_punct) | ||||||
|  | ASCII = EN.lexicon.add_flag(orth.is_ascii) | ||||||
|  | TITLE = EN.lexicon.add_flag(orth.is_title) | ||||||
|  | LOWER = EN.lexicon.add_flag(orth.is_lower) | ||||||
|  | UPPER = EN.lexicon.add_flag(orth.is_upper) | ||||||
| 
 | 
 | ||||||
|     Properties of the Lexeme are accessed by passing LexID to the accessor methods. | OFT_LOWER = EN.lexicon.add_flag(orth.case_trend('lower', LOWER_THRESH)) | ||||||
|     Access is cheap/free, as the LexID is the memory address of the Lexeme. | OFT_UPPER = EN.lexicon.add_flag(orth.case_trend('upper', UPPER_THRESH)) | ||||||
|  | OFT_TITLE = EN.lexicon.add_flag(orth.case_trend('title', TITLE_THRESH)) | ||||||
| 
 | 
 | ||||||
|     Args: | CAN_PUNCT = EN.lexicon.add_flag(orth.can_tag("PUNCT", TAG_THRESH)) | ||||||
|         string (unicode):  The string to be looked up. Must be unicode, not bytes. | CAN_CONJ = EN.lexicon.add_flag(orth.can_tag("CONJ", TAG_THRESH)) | ||||||
| 
 | CAN_NUM = EN.lexicon.add_flag(orth.can_tag("NUM", TAG_THRESH)) | ||||||
|     Returns: | CAN_N = EN.lexicon.add_flag(orth.can_tag("N", TAG_THRESH)) | ||||||
|         lexeme (LexID): A reference to a lexical type. | CAN_DET = EN.lexicon.add_flag(orth.can_tag("DET", TAG_THRESH)) | ||||||
|     """ | CAN_ADP = EN.lexicon.add_flag(orth.can_tag("ADP", TAG_THRESH)) | ||||||
|     return EN.lookup(string) | CAN_ADJ = EN.lexicon.add_flag(orth.can_tag("ADJ", TAG_THRESH)) | ||||||
|  | CAN_ADV = EN.lexicon.add_flag(orth.can_tag("ADV", TAG_THRESH)) | ||||||
|  | CAN_VERB = EN.lexicon.add_flag(orth.can_tag("VERB", TAG_THRESH)) | ||||||
|  | CAN_NOUN = EN.lexicon.add_flag(orth.can_tag("NOUN", TAG_THRESH)) | ||||||
|  | CAN_PDT = EN.lexicon.add_flag(orth.can_tag("PDT", TAG_THRESH)) | ||||||
|  | CAN_POS = EN.lexicon.add_flag(orth.can_tag("POS", TAG_THRESH)) | ||||||
|  | CAN_PRON = EN.lexicon.add_flag(orth.can_tag("PRON", TAG_THRESH)) | ||||||
|  | CAN_PRT = EN.lexicon.add_flag(orth.can_tag("PRT", TAG_THRESH)) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def add_string_views(view_funcs): | # These are the name of string transforms | ||||||
|     """Add a string view to existing and previous lexical entries. | SIC = EN.lexicon.add_transform(orth.sic_string) | ||||||
| 
 | CANON_CASED = EN.lexicon.add_transform(orth.canon_case) | ||||||
|     Args: | SHAPE = EN.lexicon.add_transform(orth.word_shape) | ||||||
|         get_view (function): A unicode --> unicode function. | NON_SPARSE = EN.lexicon.add_transform(orth.non_sparse) | ||||||
| 
 |  | ||||||
|     Returns: |  | ||||||
|         view_id (int): An integer key you can use to access the view. |  | ||||||
|     """ |  | ||||||
|     pass |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def load_clusters(location): |  | ||||||
|     """Load cluster data. |  | ||||||
|     """ |  | ||||||
|     pass |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def load_unigram_probs(location): |  | ||||||
|     """Load unigram probabilities. |  | ||||||
|     """ |  | ||||||
|     pass |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def load_case_stats(location): |  | ||||||
|     """Load case stats. |  | ||||||
|     """ |  | ||||||
|     pass |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def load_tag_stats(location): |  | ||||||
|     """Load tag statistics. |  | ||||||
|     """ |  | ||||||
|     pass |  | ||||||
|  |  | ||||||
|  | @ -3,18 +3,23 @@ from libc.stdint cimport uint64_t | ||||||
| from spacy.word cimport Lexeme | from spacy.word cimport Lexeme | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | cdef class Lexicon: | ||||||
|  |     cdef public list flag_checkers | ||||||
|  |     cdef public list string_transformers | ||||||
|  | 
 | ||||||
|  |     cdef dict lexicon | ||||||
|  | 
 | ||||||
|  |     cpdef Lexeme lookup(self, unicode string) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| cdef class Language: | cdef class Language: | ||||||
|     cdef object name |     cdef object name | ||||||
|     cdef dict blobs |     cdef dict cache | ||||||
|     cdef dict lexicon |     cpdef readonly Lexicon lexicon | ||||||
| 
 | 
 | ||||||
|     cpdef list tokenize(self, unicode text) |     cpdef list tokenize(self, unicode text) | ||||||
| 
 | 
 | ||||||
|     cdef Word lookup(self, unicode string) |     cdef list _tokenize(self, unicode string) | ||||||
|     cdef list lookup_chunk(self, unicode chunk) |     cpdef list _split(self, unicode string) | ||||||
|  |     cpdef int _split_one(self, unicode word) | ||||||
|      |      | ||||||
|     cdef list new_chunk(self, unicode string, list substrings) |  | ||||||
|     cdef Word new_lexeme(self, unicode lex) |  | ||||||
|      |  | ||||||
|     cpdef list find_substrings(self, unicode chunk) |  | ||||||
|     cdef int find_split(self, unicode word) |  | ||||||
|  |  | ||||||
							
								
								
									
										206
									
								
								spacy/lang.pyx
									
									
									
									
									
								
							
							
						
						
									
										206
									
								
								spacy/lang.pyx
									
									
									
									
									
								
							|  | @ -6,37 +6,37 @@ Provides the main implementation for the spacy tokenizer. Specific languages | ||||||
| subclass the Language class, over-writing the tokenization rules as necessary. | subclass the Language class, over-writing the tokenization rules as necessary. | ||||||
| Special-case tokenization rules are read from data/<lang>/tokenization . | Special-case tokenization rules are read from data/<lang>/tokenization . | ||||||
| """ | """ | ||||||
| 
 |  | ||||||
|   |  | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| from libc.stdlib cimport calloc, free | from libc.stdlib cimport calloc, free | ||||||
| 
 | 
 | ||||||
| from . import util | from . import util | ||||||
|  | import json | ||||||
| from os import path | from os import path | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef class Language: | cdef class Language: | ||||||
|     view_funcs = [] |  | ||||||
|     def __cinit__(self, name): |     def __cinit__(self, name): | ||||||
|         self.name = name |         self.name = name | ||||||
|         self.blobs = {} |         self.cache = {} | ||||||
|         self.lexicon = {} |         self.lexicon = Lexicon() | ||||||
|         self.load_tokenization(util.read_tokenization(name)) |         self.load_tokenization(util.read_tokenization(name)) | ||||||
|         self.load_dist_info(util.read_dist_info(name)) |  | ||||||
| 
 | 
 | ||||||
|     cpdef list tokenize(self, unicode string): |     cpdef list tokenize(self, unicode string): | ||||||
|         """Tokenize. |         """Tokenize a string. | ||||||
| 
 | 
 | ||||||
|         Split the string into tokens. |         The tokenization rules are defined in two places: | ||||||
|  | 
 | ||||||
|  |         * The data/<lang>/tokenization table, which handles special cases like contractions; | ||||||
|  |         * The appropriate :py:meth:`find_split` function, which is used to split | ||||||
|  |           off punctuation etc. | ||||||
| 
 | 
 | ||||||
|         Args: |         Args: | ||||||
|             string (unicode): The string to split. |             string (unicode): The string to be tokenized.  | ||||||
| 
 | 
 | ||||||
|         Returns: |         Returns: | ||||||
|             tokens (list): A list of Lexeme objects. |             tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs. | ||||||
|         """ |         """ | ||||||
|         cdef list blob |  | ||||||
|         cdef list tokens = [] |         cdef list tokens = [] | ||||||
|         cdef size_t length = len(string) |         cdef size_t length = len(string) | ||||||
|         cdef size_t start = 0 |         cdef size_t start = 0 | ||||||
|  | @ -44,74 +44,28 @@ cdef class Language: | ||||||
|         for c in string: |         for c in string: | ||||||
|             if c == ' ': |             if c == ' ': | ||||||
|                 if start < i: |                 if start < i: | ||||||
|                     blob = self.lookup_blob(string[start:i]) |                     tokens.extend(self._tokenize(string[start:i])) | ||||||
|                     tokens.extend(blob) |  | ||||||
|                 start = i + 1 |                 start = i + 1 | ||||||
|             i += 1 |             i += 1 | ||||||
|         if start < i: |         if start < i: | ||||||
|             chunk = self.lookup_blob(string[start:]) |             tokens.extend(self._tokenize(string[start:])) | ||||||
|             tokens.extend(chunk) |  | ||||||
|         return tokens |         return tokens | ||||||
| 
 | 
 | ||||||
|     cdef Lexeme lookup(self, unicode string): |     cdef list _tokenize(self, unicode string): | ||||||
|         assert len(string) != 0 |         if string in self.cache: | ||||||
|         cdef Word word  |             return self.cache[string] | ||||||
|         if string in self.vocab: |         cdef list lexemes = [] | ||||||
|             word = self.vocab[string] |         substrings = self._split(string) | ||||||
|         else: |  | ||||||
|             word = self.new_lexeme(string) |  | ||||||
|         return word |  | ||||||
| 
 |  | ||||||
|     cdef list lookup_blob(self, unicode string): |  | ||||||
|         cdef list chunk |  | ||||||
|         cdef size_t blob_id |  | ||||||
|         if string in self.blobs: |  | ||||||
|             blob = self.blobs[string] |  | ||||||
|         else: |  | ||||||
|             blob = self.new_blob(string, self.find_substrings(string)) |  | ||||||
|         return chunk |  | ||||||
| 
 |  | ||||||
|     cdef list new_blob(self, unicode string, list substrings): |  | ||||||
|         blob = [] |  | ||||||
|         for i, substring in enumerate(substrings): |         for i, substring in enumerate(substrings): | ||||||
|             blob.append(self.lookup(substring)) |             lexemes.append(self.lookup(substring)) | ||||||
|         self.blobs[string] = chunk |         self.cache[string] = lexemes | ||||||
|         return blob |         return lexemes | ||||||
| 
 | 
 | ||||||
|     cdef Word new_lexeme(self, unicode string): |     cpdef list _split(self, unicode string): | ||||||
|         # TODO |         """Find how to split a contiguous span of non-space characters into substrings. | ||||||
|         #lexeme = Lexeme(string.encode('utf8'), string_views) |  | ||||||
|         #return lexeme |  | ||||||
| 
 |  | ||||||
|     """ |  | ||||||
|     def add_view_funcs(self, list view_funcs): |  | ||||||
|         self.view_funcs.extend(view_funcs) |  | ||||||
|         cdef size_t nr_views = len(self.view_funcs) |  | ||||||
| 
 |  | ||||||
|         cdef unicode view |  | ||||||
|         cdef StringHash hashed |  | ||||||
|         cdef StringHash key |  | ||||||
|         cdef unicode string |  | ||||||
|         cdef LexID lex_id |  | ||||||
|         cdef Lexeme* word |  | ||||||
| 
 |  | ||||||
|         for key, lex_id in self.vocab.items(): |  | ||||||
|             word = <Lexeme*>lex_id |  | ||||||
|             free(word.string_views) |  | ||||||
|             word.string_views = <StringHash*>calloc(nr_views, sizeof(StringHash)) |  | ||||||
|             string = word.string[:word.length].decode('utf8') |  | ||||||
|             for i, view_func in enumerate(self.view_funcs): |  | ||||||
|                 view = view_func(string) |  | ||||||
|                 hashed = hash(view) |  | ||||||
|                 word.string_views[i] = hashed |  | ||||||
|                 self.bacov[hashed] = view |  | ||||||
|     """ |  | ||||||
| 
 |  | ||||||
|     cpdef list find_substrings(self, unicode blob): |  | ||||||
|         """Find how to split a chunk into substrings. |  | ||||||
| 
 | 
 | ||||||
|         This method calls find_split repeatedly. Most languages will want to |         This method calls find_split repeatedly. Most languages will want to | ||||||
|         override find_split, but it may be useful to override this instead. |         override _split_one, but it may be useful to override this instead. | ||||||
| 
 | 
 | ||||||
|         Args: |         Args: | ||||||
|             chunk (unicode): The string to be split, e.g. u"Mike's!" |             chunk (unicode): The string to be split, e.g. u"Mike's!" | ||||||
|  | @ -120,22 +74,22 @@ cdef class Language: | ||||||
|             substrings (list): The component substrings, e.g. [u"Mike", "'s", "!"]. |             substrings (list): The component substrings, e.g. [u"Mike", "'s", "!"]. | ||||||
|         """ |         """ | ||||||
|         substrings = [] |         substrings = [] | ||||||
|         while blob: |         while string: | ||||||
|             split = self.find_split(blob) |             split = self._split_one(string) | ||||||
|             if split == 0: |             if split == 0: | ||||||
|                 substrings.append(blob) |                 substrings.append(string) | ||||||
|                 break |                 break | ||||||
|             substrings.append(blob[:split]) |             substrings.append(string[:split]) | ||||||
|             blob = blob[split:] |             string = string[split:] | ||||||
|         return substrings |         return substrings | ||||||
| 
 | 
 | ||||||
|     cdef int find_split(self, unicode word): |     cpdef int _split_one(self, unicode word): | ||||||
|         return len(word) |         return len(word) | ||||||
| 
 | 
 | ||||||
|     def load_tokenization(self, token_rules): |     def load_special_tokenization(self, token_rules): | ||||||
|         '''Load special-case tokenization rules. |         '''Load special-case tokenization rules. | ||||||
| 
 | 
 | ||||||
|         Loads special-case tokenization rules into the Language.chunk cache, |         Loads special-case tokenization rules into the Language.cache cache, | ||||||
|         read from data/<lang>/tokenization . The special cases are loaded before |         read from data/<lang>/tokenization . The special cases are loaded before | ||||||
|         any language data is tokenized, giving these priority.  For instance, |         any language data is tokenized, giving these priority.  For instance, | ||||||
|         the English tokenization rules map "ain't" to ["are", "not"]. |         the English tokenization rules map "ain't" to ["are", "not"]. | ||||||
|  | @ -144,25 +98,83 @@ cdef class Language: | ||||||
|             token_rules (list): A list of (chunk, tokens) pairs, where chunk is |             token_rules (list): A list of (chunk, tokens) pairs, where chunk is | ||||||
|                 a string and tokens is a list of strings. |                 a string and tokens is a list of strings. | ||||||
|         ''' |         ''' | ||||||
|         for chunk, tokens in token_rules: |         for string, substrings in token_rules: | ||||||
|             self.new_chunk(chunk, tokens) |             lexemes = [] | ||||||
|  |             for i, substring in enumerate(substrings): | ||||||
|  |                 lexemes.append(self.lookup(substring)) | ||||||
|  |             self.cache[string] = lexemes | ||||||
|   |   | ||||||
|     def load_dist_info(self, dist_info): |  | ||||||
|         '''Load distributional information for the known lexemes of the language. |  | ||||||
| 
 | 
 | ||||||
|         The distributional information is read from data/<lang>/dist_info.json . | cdef class Lexicon: | ||||||
|         It contains information like the (smoothed) unigram log probability of |     def __cinit__(self): | ||||||
|         the word, how often the word is found upper-cased, how often the word |         self.flag_checkers = [] | ||||||
|         is found title-cased, etc. |         self.string_transforms = [] | ||||||
|         ''' |         self.lexicon = {} | ||||||
|  | 
 | ||||||
|  |     cpdef Lexeme lookup(self, unicode string): | ||||||
|  |         """Retrieve (or create, if not found) a Lexeme for a string, and return it. | ||||||
|  |      | ||||||
|  |         Args: | ||||||
|  |             string (unicode):  The string to be looked up. Must be unicode, not bytes. | ||||||
|  | 
 | ||||||
|  |         Returns: | ||||||
|  |             lexeme (Lexeme): A reference to a lexical type. | ||||||
|  |         """ | ||||||
|  |         assert len(string) != 0 | ||||||
|  |         if string in self.lexicon: | ||||||
|  |             return self.lexicon[string] | ||||||
|  |          | ||||||
|  |         prob = _pop_default(self.probs, string, 0.0) | ||||||
|  |         cluster = _pop_default(self.clusters, string, 0.0) | ||||||
|  |         case_stats = _pop_default(self.case_stats, string, {}) | ||||||
|  |         tag_stats = _pop_default(self.tag_stats, string, {}) | ||||||
|  | 
 | ||||||
|  |         cdef Lexeme word = Lexeme(string, prob, cluster, case_stats, tag_stats, | ||||||
|  |                                   self.flag_checkers, self.string_transformers) | ||||||
|  |         self.lexicon[string] = word | ||||||
|  |         return word | ||||||
|  | 
 | ||||||
|  |     def add_flag(self, flag_checker): | ||||||
|         cdef unicode string |         cdef unicode string | ||||||
|         cdef dict word_dist |         cdef Lexeme word | ||||||
|         cdef Word w |         flag_id = len(self.flag_checkers) | ||||||
|         for string, word_dist in dist_info.items(): |         for string, word in self.lexicon.items(): | ||||||
|             w = self.lookup(string) |             if flag_checker(string, word.prob, {}): | ||||||
|             w.prob = word_dist.prob |                 word.set_flag(flag_id) | ||||||
|             w.cluster = word_dist.cluster |         self.flag_checkers.append(flag_checker) | ||||||
|             for flag in word_dist.flags: |         return flag_id | ||||||
|                 w.dist_flags |= DIST_FLAGS[flag] | 
 | ||||||
|             for tag in word_dist.tagdict: |     def add_transform(self, string_transform): | ||||||
|                 w.possible_tags |= TAGS[tag] |         self.string_transformers.append(string_transform) | ||||||
|  |         return len(self.string_transformers) - 1 | ||||||
|  | 
 | ||||||
|  |     def load_probs(self, location): | ||||||
|  |         """Load unigram probabilities. | ||||||
|  |         """ | ||||||
|  |         self.probs = json.load(location) | ||||||
|  |          | ||||||
|  |         cdef Lexeme word | ||||||
|  |         cdef unicode string | ||||||
|  | 
 | ||||||
|  |         for string, word in self.lexicon.items(): | ||||||
|  |             prob = _pop_default(self.probs, string, 0.0) | ||||||
|  |             word.prob = prob | ||||||
|  | 
 | ||||||
|  |     def load_clusters(self, location): | ||||||
|  |         self.probs = json.load(location) | ||||||
|  |          | ||||||
|  |         cdef Lexeme word | ||||||
|  |         cdef unicode string | ||||||
|  | 
 | ||||||
|  |         for string, word in self.lexicon.items(): | ||||||
|  |             cluster = _pop_default(self.cluster, string, 0) | ||||||
|  |             word.cluster = cluster | ||||||
|  | 
 | ||||||
|  |     def load_stats(self, location): | ||||||
|  |         """Load distributional stats. | ||||||
|  |         """ | ||||||
|  |         raise NotImplementedError | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def _pop_default(dict d, key, default): | ||||||
|  |     return d.pop(key) if key in d else default | ||||||
|  |  | ||||||
|  | @ -1,54 +0,0 @@ | ||||||
| import os |  | ||||||
| from os import path |  | ||||||
| import codecs |  | ||||||
| import json |  | ||||||
| 
 |  | ||||||
| DATA_DIR = path.join(path.dirname(__file__), '..', 'data') |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def utf8open(loc, mode='r'): |  | ||||||
|     return codecs.open(loc, mode, 'utf8') |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def load_case_stats(data_dir): |  | ||||||
|     case_loc = path.join(data_dir, 'case') |  | ||||||
|     case_stats = {} |  | ||||||
|     with utf8open(case_loc) as cases_file: |  | ||||||
|         for line in cases_file: |  | ||||||
|             word, upper, title = line.split() |  | ||||||
|             case_stats[word] = (float(upper), float(title)) |  | ||||||
|     return case_stats |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def read_dist_info(lang): |  | ||||||
|     dist_path = path.join(DATA_DIR, lang, 'distribution_info.json') |  | ||||||
|     if path.exists(dist_path): |  | ||||||
|         with open(dist_path) as file_: |  | ||||||
|             dist_info = json.load(file_) |  | ||||||
|     else: |  | ||||||
|         dist_info = {} |  | ||||||
|     return dist_info |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def read_tokenization(lang): |  | ||||||
|     loc = path.join(DATA_DIR, lang, 'tokenization') |  | ||||||
|     entries = [] |  | ||||||
|     seen = set() |  | ||||||
|     with utf8open(loc) as file_: |  | ||||||
|         for line in file_: |  | ||||||
|             line = line.strip() |  | ||||||
|             if line.startswith('#'): |  | ||||||
|                 continue |  | ||||||
|             if not line: |  | ||||||
|                 continue |  | ||||||
|             pieces = line.split() |  | ||||||
|             chunk = pieces.pop(0) |  | ||||||
|             assert chunk not in seen, chunk |  | ||||||
|             seen.add(chunk) |  | ||||||
|             entries.append((chunk, list(pieces))) |  | ||||||
|             if chunk[0].isalpha() and chunk[0].islower(): |  | ||||||
|                 chunk = chunk[0].title() + chunk[1:] |  | ||||||
|                 pieces[0] = pieces[0][0].title() + pieces[0][1:] |  | ||||||
|                 seen.add(chunk) |  | ||||||
|                 entries.append((chunk, pieces)) |  | ||||||
|     return entries |  | ||||||
|  | @ -7,19 +7,19 @@ DEF MAX_FLAG = 64 | ||||||
| cdef class Lexeme: | cdef class Lexeme: | ||||||
|     # NB: the readonly keyword refers to _Python_ access. The attributes are |     # NB: the readonly keyword refers to _Python_ access. The attributes are | ||||||
|     # writeable from Cython. |     # writeable from Cython. | ||||||
|     cdef readonly id_t id |     cpdef readonly id_t id | ||||||
|     cdef readonly size_t length |     cpdef readonly size_t length | ||||||
|     cdef readonly double prob |     cpdef readonly double prob | ||||||
|     cdef readonly size_t cluster |     cpdef readonly size_t cluster | ||||||
| 
 | 
 | ||||||
|     cdef readonly utf8_t* strings |     cdef utf8_t* views | ||||||
|     cdef readonly size_t nr_strings |     cdef size_t nr_views | ||||||
| 
 | 
 | ||||||
|     cdef readonly flag_t flags |     cdef readonly flag_t flags | ||||||
| 
 | 
 | ||||||
|     cpdef bint check_flag(self, size_t flag_id) except * |     cpdef bint check_flag(self, size_t flag_id) except * | ||||||
|     cpdef int set_flag(self, size_t flag_id) except -1 |     cpdef int set_flag(self, size_t flag_id) except -1 | ||||||
|      |      | ||||||
|     cpdef unicode get_string(self, size_t i) except * |     cpdef unicode get_view_string(self, size_t i) | ||||||
|     cpdef id_t get_id(self, size_t i) except 0 |     cpdef id_t get_view_id(self, size_t i) except 0 | ||||||
|     cpdef int add_strings(self, list strings) except -1 |     cpdef int add_view(self, unicode view) except -1 | ||||||
|  |  | ||||||
							
								
								
									
										172
									
								
								spacy/word.pyx
									
									
									
									
									
								
							
							
						
						
									
										172
									
								
								spacy/word.pyx
									
									
									
									
									
								
							|  | @ -2,10 +2,7 @@ | ||||||
| # cython: embedsignature=True | # cython: embedsignature=True | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| from libc.stdlib cimport calloc, free | from libc.stdlib cimport calloc, free, realloc | ||||||
| 
 |  | ||||||
| from spacy cimport flags |  | ||||||
| 
 |  | ||||||
| 
 | 
 | ||||||
| cdef class Lexeme: | cdef class Lexeme: | ||||||
|     """A lexical type. |     """A lexical type. | ||||||
|  | @ -53,7 +50,7 @@ cdef class Lexeme: | ||||||
|             the same cluster ID as "pineapple", which is not what we'd like. |             the same cluster ID as "pineapple", which is not what we'd like. | ||||||
|     """ |     """ | ||||||
|     def __cinit__(self, utf8_t string, size_t length, list views, prob=0.0, |     def __cinit__(self, utf8_t string, size_t length, list views, prob=0.0, | ||||||
|                   cluster=0, orth_flags=0, dist_flags=0, possible_tags=0): |                   flags=0): | ||||||
|         self.id = <id_t>&string |         self.id = <id_t>&string | ||||||
|         self.length = length |         self.length = length | ||||||
|         self.nr_strings = 0 |         self.nr_strings = 0 | ||||||
|  | @ -66,25 +63,21 @@ cdef class Lexeme: | ||||||
|         def __get__(self): |         def __get__(self): | ||||||
|             return self.strings[0].decode('utf8') |             return self.strings[0].decode('utf8') | ||||||
| 
 | 
 | ||||||
|     cpdef unicode get_view_string(self, size_t i) except *: |     cpdef unicode get_view_string(self, size_t i): | ||||||
|         assert i < self.nr_strings |         assert i < self.nr_strings | ||||||
|         return self.strings[i].decode('utf8') |         return self.strings[i].decode('utf8') | ||||||
| 
 | 
 | ||||||
|     cpdef intptr_t get_view_id(self, size_t i) except 0: |     cpdef id_t get_view_id(self, size_t i) except 0: | ||||||
|         assert i < self.nr_strings |         assert i < self.nr_strings | ||||||
|         return <string_id_t>&self.views[i] |         return <id_t>&self.views[i] | ||||||
| 
 | 
 | ||||||
|     cpdef int add_views(self, list views) except -1: |     cpdef int add_view(self, unicode view) except -1: | ||||||
|         self.nr_views += len(strings) |         self.nr_views += 1 | ||||||
|         self.views = <char**>realloc(self.views, self.nr_views * sizeof(utf8_t)) |         self.views = <char**>realloc(self.views, self.nr_views * sizeof(utf8_t)) | ||||||
|         cdef unicode view |         cdef bytes utf8_string = view.encode('utf8') | ||||||
|         cdef bytes utf8_string |  | ||||||
|         for i, view in enumerate(strings): |  | ||||||
|             view = string_views[i] |  | ||||||
|             utf8_string = view.encode('utf8') |  | ||||||
|         # Intern strings, allowing pointer comparison |         # Intern strings, allowing pointer comparison | ||||||
|         utf8_string = intern(utf8_string) |         utf8_string = intern(utf8_string) | ||||||
|             self.views[i] = utf8_string |         self.views[self.nr_views - 1] = utf8_string | ||||||
| 
 | 
 | ||||||
|     cpdef bint check_flag(self, size_t flag_id) except *: |     cpdef bint check_flag(self, size_t flag_id) except *: | ||||||
|         """Access the value of one of the pre-computed boolean distribution features. |         """Access the value of one of the pre-computed boolean distribution features. | ||||||
|  | @ -92,154 +85,7 @@ cdef class Lexeme: | ||||||
|         Meanings depend on the language-specific distributional features being loaded. |         Meanings depend on the language-specific distributional features being loaded. | ||||||
|         The suggested features for latin-alphabet languages are: TODO |         The suggested features for latin-alphabet languages are: TODO | ||||||
|         """ |         """ | ||||||
|         assert flag_id < flags.MAX_FLAG |  | ||||||
|         return self.flags & (1 << flag_id) |         return self.flags & (1 << flag_id) | ||||||
| 
 | 
 | ||||||
|     cpdef int set_flag(self, size_t flag_id) except -1: |     cpdef int set_flag(self, size_t flag_id) except -1: | ||||||
|         assert flag_id < flags.MAX_FLAG |  | ||||||
|         self.flags |= (1 << flag_id) |         self.flags |= (1 << flag_id) | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # |  | ||||||
| #cdef class CasedWord(Word): |  | ||||||
| #    def __cinit__(self, bytes string, list views): |  | ||||||
| #        Word.__cinit__(self, string, string_views) |  | ||||||
| #     |  | ||||||
| #    cpdef bint is_often_uppered(self) except *: |  | ||||||
| #        '''Check the OFT_UPPER distributional flag for the word. |  | ||||||
| #     |  | ||||||
| #        The OFT_UPPER flag records whether a lower-cased version of the word |  | ||||||
| #        is found in all-upper case frequently in a large sample of text, where |  | ||||||
| #        "frequently" is defined as P >= 0.95 (chosen for high mutual information for |  | ||||||
| #        POS tagging). |  | ||||||
| #     |  | ||||||
| #        Case statistics are estimated from a large text corpus. Estimates are read |  | ||||||
| #        from data/en/case_stats, and can be replaced using spacy.en.load_case_stats. |  | ||||||
| #     |  | ||||||
| #        >>> is_often_uppered(lookup(u'nato')) |  | ||||||
| #        True |  | ||||||
| #        >>> is_often_uppered(lookup(u'the'))  |  | ||||||
| #        False |  | ||||||
| #        ''' |  | ||||||
| #        return self.dist_flags & (1 << OFT_UPPER) |  | ||||||
| # |  | ||||||
| # |  | ||||||
| #    cpdef bint is_often_titled(self) except *: |  | ||||||
| #        '''Check the OFT_TITLE distributional flag for the word. |  | ||||||
| #     |  | ||||||
| #        The OFT_TITLE flag records whether a lower-cased version of the word |  | ||||||
| #        is found title-cased (see string.istitle) frequently in a large sample of text, |  | ||||||
| #        where "frequently" is defined as P >= 0.3 (chosen for high mutual information for |  | ||||||
| #        POS tagging). |  | ||||||
| #     |  | ||||||
| #        Case statistics are estimated from a large text corpus. Estimates are read |  | ||||||
| #        from data/en/case_stats, and can be replaced using spacy.en.load_case_stats. |  | ||||||
| #     |  | ||||||
| #        >>> is_oft_upper(lookup(u'john')) |  | ||||||
| #        True |  | ||||||
| #        >>> is_oft_upper(lookup(u'Bill'))  |  | ||||||
| #        False |  | ||||||
| #        ''' |  | ||||||
| #        return self.dist_flags & (1 << OFT_TITLE) |  | ||||||
| # |  | ||||||
| # |  | ||||||
| #    cpdef bint is_alpha(self) except *: |  | ||||||
| #        """Check whether all characters in the word's string are alphabetic. |  | ||||||
| #         |  | ||||||
| #        Should match the :py:func:`unicode.isalpha()` function. |  | ||||||
| # |  | ||||||
| #        >>> is_alpha(lookup(u'Hello')) |  | ||||||
| #        True |  | ||||||
| #        >>> is_alpha(lookup(u'العرب')) |  | ||||||
| #        True |  | ||||||
| #        >>> is_alpha(lookup(u'10')) |  | ||||||
| #        False |  | ||||||
| #        """ |  | ||||||
| #        return self.orth_flags & 1 << IS_ALPHA |  | ||||||
| # |  | ||||||
| #    cpdef bint is_digit(self) except *: |  | ||||||
| #        """Check whether all characters in the word's string are numeric. |  | ||||||
| #     |  | ||||||
| #        Should match the :py:func:`unicode.isdigit()` function. |  | ||||||
| # |  | ||||||
| #        >>> is_digit(lookup(u'10')) |  | ||||||
| #        True |  | ||||||
| #        >>> is_digit(lookup(u'๐')) |  | ||||||
| #        True |  | ||||||
| #        >>> is_digit(lookup(u'one')) |  | ||||||
| #        False |  | ||||||
| #        """ |  | ||||||
| #        return self.orth_flags & 1 << IS_DIGIT |  | ||||||
| # |  | ||||||
| #    cpdef bint is_punct(self) except *: |  | ||||||
| #        """Check whether all characters belong to a punctuation unicode data category |  | ||||||
| #        for a Lexeme ID. |  | ||||||
| # |  | ||||||
| #        >>> is_punct(lookup(u'.')) |  | ||||||
| #        True |  | ||||||
| #        >>> is_punct(lookup(u'⁒')) |  | ||||||
| #        True |  | ||||||
| #        >>> is_punct(lookup(u' ')) |  | ||||||
| #        False |  | ||||||
| #        """ |  | ||||||
| #        return self.orth_flags & 1 << IS_PUNCT |  | ||||||
| # |  | ||||||
| #    cpdef bint is_space(self) except *: |  | ||||||
| #        """Give the result of unicode.isspace() for a Lexeme ID. |  | ||||||
| # |  | ||||||
| #        >>> is_space(lookup(u'\\t')) |  | ||||||
| #        True |  | ||||||
| #        >>> is_space(lookup(u'<unicode space>')) |  | ||||||
| #        True |  | ||||||
| #        >>> is_space(lookup(u'Hi\\n')) |  | ||||||
| #        False |  | ||||||
| #        """ |  | ||||||
| #        return self.orth_flags & 1 << IS_SPACE |  | ||||||
| # |  | ||||||
| #    cpdef bint is_lower(self) except *: |  | ||||||
| #        """Give the result of unicode.islower() for a Lexeme ID. |  | ||||||
| # |  | ||||||
| #        >>> is_lower(lookup(u'hi')) |  | ||||||
| #        True |  | ||||||
| #        >>> is_lower(lookup(<unicode>)) |  | ||||||
| #        True |  | ||||||
| #        >>> is_lower(lookup(u'10')) |  | ||||||
| #        False |  | ||||||
| #        """ |  | ||||||
| #        return self.orth_flags & 1 << IS_LOWER |  | ||||||
| # |  | ||||||
| #    cpdef bint is_upper(self) except *: |  | ||||||
| #        """Give the result of unicode.isupper() for a Lexeme ID. |  | ||||||
| # |  | ||||||
| #        >>> is_upper(lookup(u'HI')) |  | ||||||
| #        True |  | ||||||
| #        >>> is_upper(lookup(u'H10')) |  | ||||||
| #        True |  | ||||||
| #        >>> is_upper(lookup(u'10')) |  | ||||||
| #        False |  | ||||||
| #        """ |  | ||||||
| #        return self.orth_flags & 1 << IS_UPPER |  | ||||||
| # |  | ||||||
| #    cpdef bint is_title(self) except *: |  | ||||||
| #        """Give the result of unicode.istitle() for a Lexeme ID. |  | ||||||
| # |  | ||||||
| #        >>> is_title(lookup(u'Hi')) |  | ||||||
| #        True |  | ||||||
| #        >>> is_title(lookup(u'Hi1')) |  | ||||||
| #        True |  | ||||||
| #        >>> is_title(lookup(u'1')) |  | ||||||
| #        False |  | ||||||
| #        """ |  | ||||||
| #        return self.orth_flags & 1 << IS_TITLE |  | ||||||
| # |  | ||||||
| #    cpdef bint is_ascii(self) except *: |  | ||||||
| #        """Give the result of checking whether all characters in the string are ascii. |  | ||||||
| # |  | ||||||
| #        >>> is_ascii(lookup(u'Hi')) |  | ||||||
| #        True |  | ||||||
| #        >>> is_ascii(lookup(u' ')) |  | ||||||
| #        True |  | ||||||
| #        >>> is_title(lookup(u'<unicode>')) |  | ||||||
| #        False |  | ||||||
| #        """ |  | ||||||
| #        return self.orth_flags & 1 << IS_ASCII |  | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user