commit e9a62b6eba (parent 68bae2fec6)

* Refactoring with Lexeme as a class now compiles. Basic design seems to work
diff --git a/spacy/en.pxd b/spacy/en.pxd (14 changes)
@@ -1,4 +1,4 @@
-from spacy.spacy cimport Language
+from spacy.lang cimport Language
 from spacy.word cimport Lexeme
 cimport cython
 

@@ -31,12 +31,14 @@ cpdef size_t POS
 cpdef size_t PRON
 cpdef size_t PRT
 
-cdef class English(spacy.Language):
-    cdef int find_split(self, unicode word)
+cpdef size_t SIC
+cpdef size_t CANON_CASED
+cpdef size_t SHAPE
+cpdef size_t NON_SPARSE
 
 
-cpdef Word lookup(unicode word)
-cpdef list tokenize(unicode string)
-cpdef English EN
+cdef English EN
+
+cdef class English(Language):
+    cpdef int _split_one(self, unicode word)
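Note on the en.pxd change: English now extends Language from spacy.lang rather than spacy.spacy, and the tokenizer hook is renamed from find_split to the cpdef method _split_one. A minimal plain-Python sketch of this base-class/hook pattern (the class bodies here are illustrative stand-ins, not the Cython declarations above):

    class Language(object):
        def _split_one(self, word):
            # Default hook: treat the whole chunk as a single token.
            return len(word)

    class English(Language):
        def _split_one(self, word):
            # English-specific rule from this commit: split off "'s".
            if word.endswith("'s") and len(word) >= 3:
                return len(word) - 2
            return len(word)

    assert English()._split_one(u"Mike's") == 4   # "Mike" + "'s"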
diff --git a/spacy/en.pyx b/spacy/en.pyx (189 changes)
@@ -31,6 +31,7 @@ same scheme. Tokenization problems are a major cause of poor performance for
 NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module
 provides a fully Penn Treebank 3-compliant tokenizer.
 '''
+# TODO
 #The script translate_treebank_tokenization can be used to transform a treebank's
 #annotation to use one of the spacy tokenization schemes.
 

@@ -40,90 +41,14 @@ from __future__ import unicode_literals
 from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
 
-cimport spacy
-
-
-# Python-readable flag constants --- can't read an enum from Python
-
-# Don't want to manually assign these numbers, or we'll insert one and have to
-# change them all.
-# Don't use "i", as we don't want it in the global scope!
-cdef size_t __i = 0
-
-ALPHA = __i; i += 1
-DIGIT = __i; __i += 1
-PUNCT = __i; __i += 1
-SPACE = __i; __i += 1
-LOWER = __i; __i += 1
-UPPER = __i; __i += 1
-TITLE = __i; __i += 1
-ASCII = __i; __i += 1
-
-OFT_LOWER = __i; __i += 1
-OFT_UPPER = __i; __i += 1
-OFT_TITLE = __i; __i += 1
-
-PUNCT = __i; __i += 1
-CONJ = __i; __i += 1
-NUM = __i; __i += 1
-X = __i; __i += 1
-DET = __i; __i += 1
-ADP = __i; __i += 1
-ADJ = __i; __i += 1
-ADV = __i; __i += 1
-VERB = __i; __i += 1
-NOUN = __i; __i += 1
-PDT = __i; __i += 1
-POS = __i; __i += 1
-PRON = __i; __i += 1
-PRT = __i; __i += 1
-
-
-# These are for the string views
-__i = 0
-SIC = __i; __i += 1
-CANON_CASED = __i; __i += 1
-NON_SPARSE = __i; __i += 1
-SHAPE = __i; __i += 1
-NR_STRING_VIEWS = __i
-
-
-def get_string_views(unicode string, lexeme):
-    views = ['' for _ in range(NR_STRING_VIEWS)]
-    views[SIC] = string
-    views[CANON_CASED] = canonicalize_case(string, lexeme)
-    views[SHAPE] = get_string_shape(string)
-    views[NON_SPARSE] = get_non_sparse(string, views[CANON_CASED], views[SHAPE],
-                                       lexeme)
-    return views
-
-
-def set_orth_flags(unicode string, flags_t flags)
-    setters = [
-        (ALPHA, is_alpha),
-        (DIGIT, is_digit),
-        (PUNCT, is_punct),
-        (SPACE, is_space),
-        (LOWER, is_lower),
-        (UPPER, is_upper),
-        (SPACE, is_space)
-    ]
-
-    for bit, setter in setters:
-        if setter(string):
-            flags |= 1 << bit
-    return flags
-
-
-cdef class English(spacy.Language):
-    cdef Lexeme new_lexeme(self, unicode string, cluster=0, prob=0, case_stats=None,
-                           tag_freqs=None):
-        return Lexeme(s, length, views, prob=prob, cluster=cluster,
-                      flags=self.get_flags(string))
-
-    cdef int find_split(self, unicode word):
+cimport lang
+from spacy import orth
+
+
+cdef class English(Language):
+    cpdef int _split_one(self, unicode word):
         cdef size_t length = len(word)
         cdef int i = 0
         if word.startswith("'s") or word.startswith("'S"):
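The removed set_orth_flags helper shows how the orthographic flag bits were packed before the Lexicon.add_flag scheme took over; note the removed code carried two typos (a missing colon on the def line, and "i" where "__i" was meant in the ALPHA assignment). A corrected plain-Python sketch of the same bit-packing idea:

    def set_orth_flags(string, flags, setters):
        # setters: (bit, predicate) pairs, e.g. (ALPHA, str.isalpha)
        for bit, setter in setters:
            if setter(string):
                flags |= 1 << bit
        return flags

    ALPHA, DIGIT = 0, 1
    setters = [(ALPHA, str.isalpha), (DIGIT, str.isdigit)]
    assert set_orth_flags("10", 0, setters) == 1 << DIGIT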
@@ -132,17 +57,16 @@ cdef class English(spacy.Language):
         if word.endswith("'s") and length >= 3:
             return length - 2
         # Leading punctuation
-        if check_punct(word, 0, length):
+        if _check_punct(word, 0, length):
             return 1
         elif length >= 1:
             # Split off all trailing punctuation characters
             i = 0
-            while i < length and not check_punct(word, i, length):
+            while i < length and not _check_punct(word, i, length):
                 i += 1
         return i
 
 
-cdef bint check_punct(unicode word, size_t i, size_t length):
+cdef bint _check_punct(unicode word, size_t i, size_t length):
     # Don't count appostrophes as punct if the next char is a letter
     if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
         return i == 0
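The renamed _check_punct keeps the special case for apostrophes: an apostrophe followed by a letter counts as punctuation only at the start of a chunk, so leading quotes split off while internal possessives stay attached. A plain-Python sketch of that rule (the hunk only shows the first lines of the real function; the unicodedata category test below is an assumed stand-in for the rest):

    from unicodedata import category

    def check_punct(word, i):
        # An apostrophe followed by a letter is punct only at i == 0.
        if word[i] == "'" and i < len(word) - 1 and word[i + 1].isalpha():
            return i == 0
        return category(word[i]).startswith('P')

    assert check_punct(u"'Hello", 0)        # leading quote: split it off
    assert not check_punct(u"Mike's", 4)    # internal apostrophe: keep it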
@@ -160,69 +84,46 @@ cdef bint check_punct(unicode word, size_t i, size_t length):
 EN = English('en')
 
 
-cpdef list tokenize(unicode string):
-    """Tokenize a string.
-
-    The tokenization rules are defined in two places:
-
-    * The data/en/tokenization table, which handles special cases like contractions;
-    * The :py:meth:`spacy.en.English.find_split` function, which is used to split off punctuation etc.
-
-    Args:
-        string (unicode): The string to be tokenized.
-
-    Returns:
-        tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs.
-    """
-    return EN.tokenize(string)
-
-
-cpdef Lexeme lookup(unicode string):
-    """Retrieve (or create, if not found) a Lexeme for a string, and return its ID.
-
-    Properties of the Lexeme are accessed by passing LexID to the accessor methods.
-    Access is cheap/free, as the LexID is the memory address of the Lexeme.
-
-    Args:
-        string (unicode):  The string to be looked up. Must be unicode, not bytes.
-
-    Returns:
-        lexeme (LexID): A reference to a lexical type.
-    """
-    return EN.lookup(string)
-
-
-def add_string_views(view_funcs):
-    """Add a string view to existing and previous lexical entries.
-
-    Args:
-        get_view (function): A unicode --> unicode function.
-
-    Returns:
-        view_id (int): An integer key you can use to access the view.
-    """
-    pass
-
-
-def load_clusters(location):
-    """Load cluster data.
-    """
-    pass
-
-
-def load_unigram_probs(location):
-    """Load unigram probabilities.
-    """
-    pass
-
-
-def load_case_stats(location):
-    """Load case stats.
-    """
-    pass
-
-
-def load_tag_stats(location):
-    """Load tag statistics.
-    """
-    pass
+# Thresholds for frequency related flags
+TAG_THRESH = 0.5
+LOWER_THRESH = 0.5
+UPPER_THRESH = 0.3
+TITLE_THRESH = 0.9
+
+
+# Python-readable flag constants --- can't read an enum from Python
+ALPHA = EN.lexicon.add_flag(orth.is_alpha)
+DIGIT = EN.lexicon.add_flag(orth.is_digit)
+PUNCT = EN.lexicon.add_flag(orth.is_punct)
+SPACE = EN.lexicon.add_flag(orth.is_space)
+PUNCT = EN.lexicon.add_flag(orth.is_punct)
+ASCII = EN.lexicon.add_flag(orth.is_ascii)
+TITLE = EN.lexicon.add_flag(orth.is_title)
+LOWER = EN.lexicon.add_flag(orth.is_lower)
+UPPER = EN.lexicon.add_flag(orth.is_upper)
+
+OFT_LOWER = EN.lexicon.add_flag(orth.case_trend('lower', LOWER_THRESH))
+OFT_UPPER = EN.lexicon.add_flag(orth.case_trend('upper', UPPER_THRESH))
+OFT_TITLE = EN.lexicon.add_flag(orth.case_trend('title', TITLE_THRESH))
+
+CAN_PUNCT = EN.lexicon.add_flag(orth.can_tag("PUNCT", TAG_THRESH))
+CAN_CONJ = EN.lexicon.add_flag(orth.can_tag("CONJ", TAG_THRESH))
+CAN_NUM = EN.lexicon.add_flag(orth.can_tag("NUM", TAG_THRESH))
+CAN_N = EN.lexicon.add_flag(orth.can_tag("N", TAG_THRESH))
+CAN_DET = EN.lexicon.add_flag(orth.can_tag("DET", TAG_THRESH))
+CAN_ADP = EN.lexicon.add_flag(orth.can_tag("ADP", TAG_THRESH))
+CAN_ADJ = EN.lexicon.add_flag(orth.can_tag("ADJ", TAG_THRESH))
+CAN_ADV = EN.lexicon.add_flag(orth.can_tag("ADV", TAG_THRESH))
+CAN_VERB = EN.lexicon.add_flag(orth.can_tag("VERB", TAG_THRESH))
+CAN_NOUN = EN.lexicon.add_flag(orth.can_tag("NOUN", TAG_THRESH))
+CAN_PDT = EN.lexicon.add_flag(orth.can_tag("PDT", TAG_THRESH))
+CAN_POS = EN.lexicon.add_flag(orth.can_tag("POS", TAG_THRESH))
+CAN_PRON = EN.lexicon.add_flag(orth.can_tag("PRON", TAG_THRESH))
+CAN_PRT = EN.lexicon.add_flag(orth.can_tag("PRT", TAG_THRESH))
+
+
+# These are the name of string transforms
+SIC = EN.lexicon.add_transform(orth.sic_string)
+CANON_CASED = EN.lexicon.add_transform(orth.canon_case)
+SHAPE = EN.lexicon.add_transform(orth.word_shape)
+NON_SPARSE = EN.lexicon.add_transform(orth.non_sparse)
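The new en.pyx replaces the hand-maintained flag enum with registration: EN.lexicon.add_flag returns the bit index it allocated, so the module-level constants are just saved handles. A plain-Python sketch of why registration order fixes the IDs (the real add_flag also re-checks existing entries and takes a (string, prob, case_stats) checker; this sketch simplifies):

    class Lexicon(object):
        def __init__(self):
            self.flag_checkers = []

        def add_flag(self, flag_checker):
            flag_id = len(self.flag_checkers)       # next free bit
            self.flag_checkers.append(flag_checker)
            return flag_id

    lexicon = Lexicon()
    ALPHA = lexicon.add_flag(lambda s: s.isalpha())
    DIGIT = lexicon.add_flag(lambda s: s.isdigit())
    assert (ALPHA, DIGIT) == (0, 1)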
diff --git a/spacy/lang.pxd b/spacy/lang.pxd
@@ -3,18 +3,23 @@ from libc.stdint cimport uint64_t
 from spacy.word cimport Lexeme
 
 
+cdef class Lexicon:
+    cdef public list flag_checkers
+    cdef public list string_transformers
+
+    cdef dict lexicon
+
+    cpdef Lexeme lookup(self, unicode string)
+
+
 cdef class Language:
     cdef object name
-    cdef dict blobs
-    cdef dict lexicon
+    cdef dict cache
+    cpdef readonly Lexicon lexicon
 
     cpdef list tokenize(self, unicode text)
 
-    cdef Word lookup(self, unicode string)
-    cdef list lookup_chunk(self, unicode chunk)
-
-    cdef list new_chunk(self, unicode string, list substrings)
-    cdef Word new_lexeme(self, unicode lex)
-
-    cpdef list find_substrings(self, unicode chunk)
-    cdef int find_split(self, unicode word)
+    cdef list _tokenize(self, unicode string)
+    cpdef list _split(self, unicode string)
+    cpdef int _split_one(self, unicode word)
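The declarations now split the design in two: Lexicon owns the string-to-Lexeme vocabulary, while Language keeps only tokenizer state (a cache dict plus a readonly lexicon reference). The essential lookup contract, sketched in plain Python with a stand-in Lexeme:

    class Lexicon(object):
        def __init__(self):
            self.lexicon = {}

        def lookup(self, string):
            # One Lexeme per distinct string, created on first sight.
            assert len(string) != 0
            if string not in self.lexicon:
                self.lexicon[string] = object()   # stand-in for a Lexeme
            return self.lexicon[string]

    vocab = Lexicon()
    assert vocab.lookup(u'the') is vocab.lookup(u'the')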
				
			
			
 | 
			
		|||
							
								
								
									
										206
									
								
								spacy/lang.pyx
									
									
									
									
									
								
							
							
						
						
									
										206
									
								
								spacy/lang.pyx
									
									
									
									
									
								
							| 
						 | 
				
			
			@ -6,37 +6,37 @@ Provides the main implementation for the spacy tokenizer. Specific languages
 | 
			
		|||
 subclass the Language class, over-writing the tokenization rules as necessary.
 Special-case tokenization rules are read from data/<lang>/tokenization .
 """
 
 from __future__ import unicode_literals
 
 from libc.stdlib cimport calloc, free
 
 from . import util
+import json
 from os import path
 
 
 cdef class Language:
-    view_funcs = []
     def __cinit__(self, name):
         self.name = name
-        self.blobs = {}
-        self.lexicon = {}
+        self.cache = {}
+        self.lexicon = Lexicon()
         self.load_tokenization(util.read_tokenization(name))
-        self.load_dist_info(util.read_dist_info(name))
 
     cpdef list tokenize(self, unicode string):
-        """Tokenize.
+        """Tokenize a string.
 
-        Split the string into tokens.
+        The tokenization rules are defined in two places:
+
+        * The data/<lang>/tokenization table, which handles special cases like contractions;
+        * The appropriate :py:meth:`find_split` function, which is used to split
+          off punctuation etc.
 
         Args:
-            string (unicode): The string to split.
+            string (unicode): The string to be tokenized.
 
         Returns:
-            tokens (list): A list of Lexeme objects.
+            tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs.
         """
         cdef list blob
         cdef list tokens = []
         cdef size_t length = len(string)
         cdef size_t start = 0

@@ -44,74 +44,28 @@ cdef class Language:
         for c in string:
             if c == ' ':
                 if start < i:
-                    blob = self.lookup_blob(string[start:i])
-                    tokens.extend(blob)
+                    tokens.extend(self._tokenize(string[start:i]))
                 start = i + 1
             i += 1
         if start < i:
-            chunk = self.lookup_blob(string[start:])
-            tokens.extend(chunk)
+            tokens.extend(self._tokenize(string[start:]))
         return tokens
 
-    cdef Lexeme lookup(self, unicode string):
-        assert len(string) != 0
-        cdef Word word
-        if string in self.vocab:
-            word = self.vocab[string]
-        else:
-            word = self.new_lexeme(string)
-        return word
-
-    cdef list lookup_blob(self, unicode string):
-        cdef list chunk
-        cdef size_t blob_id
-        if string in self.blobs:
-            blob = self.blobs[string]
-        else:
-            blob = self.new_blob(string, self.find_substrings(string))
-        return chunk
-
-    cdef list new_blob(self, unicode string, list substrings):
-        blob = []
-        for i, substring in enumerate(substrings):
-            blob.append(self.lookup(substring))
-        self.blobs[string] = chunk
-        return blob
-
-    cdef Word new_lexeme(self, unicode string):
-        # TODO
-        #lexeme = Lexeme(string.encode('utf8'), string_views)
-        #return lexeme
-
-    """
-    def add_view_funcs(self, list view_funcs):
-        self.view_funcs.extend(view_funcs)
-        cdef size_t nr_views = len(self.view_funcs)
-
-        cdef unicode view
-        cdef StringHash hashed
-        cdef StringHash key
-        cdef unicode string
-        cdef LexID lex_id
-        cdef Lexeme* word
-
-        for key, lex_id in self.vocab.items():
-            word = <Lexeme*>lex_id
-            free(word.string_views)
-            word.string_views = <StringHash*>calloc(nr_views, sizeof(StringHash))
-            string = word.string[:word.length].decode('utf8')
-            for i, view_func in enumerate(self.view_funcs):
-                view = view_func(string)
-                hashed = hash(view)
-                word.string_views[i] = hashed
-                self.bacov[hashed] = view
-    """
+    cdef list _tokenize(self, unicode string):
+        if string in self.cache:
+            return self.cache[string]
+        cdef list lexemes = []
+        substrings = self._split(string)
+        for i, substring in enumerate(substrings):
+            lexemes.append(self.lookup(substring))
+        self.cache[string] = lexemes
+        return lexemes
 
-    cpdef list find_substrings(self, unicode blob):
-        """Find how to split a chunk into substrings.
+    cpdef list _split(self, unicode string):
+        """Find how to split a contiguous span of non-space characters into substrings.
 
         This method calls find_split repeatedly. Most languages will want to
-        override find_split, but it may be useful to override this instead.
+        override _split_one, but it may be useful to override this instead.
 
         Args:
             chunk (unicode): The string to be split, e.g. u"Mike's!"
@@ -120,22 +74,22 @@ cdef class Language:
             substrings (list): The component substrings, e.g. [u"Mike", "'s", "!"].
         """
         substrings = []
-        while blob:
-            split = self.find_split(blob)
+        while string:
+            split = self._split_one(string)
             if split == 0:
-                substrings.append(blob)
+                substrings.append(string)
                 break
-            substrings.append(blob[:split])
-            blob = blob[split:]
+            substrings.append(string[:split])
+            string = string[split:]
         return substrings
 
-    cdef int find_split(self, unicode word):
+    cpdef int _split_one(self, unicode word):
         return len(word)
 
-    def load_tokenization(self, token_rules):
+    def load_special_tokenization(self, token_rules):
         '''Load special-case tokenization rules.
 
-        Loads special-case tokenization rules into the Language.chunk cache,
+        Loads special-case tokenization rules into the Language.cache cache,
         read from data/<lang>/tokenization . The special cases are loaded before
         any language data is tokenized, giving these priority.  For instance,
         the English tokenization rules map "ain't" to ["are", "not"].
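_split peels one substring at a time off the front of the chunk, delegating each decision to _split_one; a return of 0 means "no further split". A plain-Python rendering of the loop, using the docstring's own example:

    def split(string, split_one):
        substrings = []
        while string:
            n = split_one(string)
            if n == 0:
                substrings.append(string)
                break
            substrings.append(string[:n])
            string = string[n:]
        return substrings

    # With English-style possessive and punctuation rules for split_one,
    # u"Mike's!" comes out as [u"Mike", u"'s", u"!"], per the docstring.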
@@ -144,25 +98,83 @@ cdef class Language:
             token_rules (list): A list of (chunk, tokens) pairs, where chunk is
                 a string and tokens is a list of strings.
         '''
-        for chunk, tokens in token_rules:
-            self.new_chunk(chunk, tokens)
+        for string, substrings in token_rules:
+            lexemes = []
+            for i, substring in enumerate(substrings):
+                lexemes.append(self.lookup(substring))
+            self.cache[string] = lexemes
 
-    def load_dist_info(self, dist_info):
-        '''Load distributional information for the known lexemes of the language.
-
-        The distributional information is read from data/<lang>/dist_info.json .
-        It contains information like the (smoothed) unigram log probability of
-        the word, how often the word is found upper-cased, how often the word
-        is found title-cased, etc.
-        '''
-        cdef unicode string
-        cdef dict word_dist
-        cdef Word w
-        for string, word_dist in dist_info.items():
-            w = self.lookup(string)
-            w.prob = word_dist.prob
-            w.cluster = word_dist.cluster
-            for flag in word_dist.flags:
-                w.dist_flags |= DIST_FLAGS[flag]
-            for tag in word_dist.tagdict:
-                w.possible_tags |= TAGS[tag]
+
+cdef class Lexicon:
+    def __cinit__(self):
+        self.flag_checkers = []
+        self.string_transforms = []
+        self.lexicon = {}
+
+    cpdef Lexeme lookup(self, unicode string):
+        """Retrieve (or create, if not found) a Lexeme for a string, and return it.
+
+        Args:
+            string (unicode):  The string to be looked up. Must be unicode, not bytes.
+
+        Returns:
+            lexeme (Lexeme): A reference to a lexical type.
+        """
+        assert len(string) != 0
+        if string in self.lexicon:
+            return self.lexicon[string]
+
+        prob = _pop_default(self.probs, string, 0.0)
+        cluster = _pop_default(self.clusters, string, 0.0)
+        case_stats = _pop_default(self.case_stats, string, {})
+        tag_stats = _pop_default(self.tag_stats, string, {})
+
+        cdef Lexeme word = Lexeme(string, prob, cluster, case_stats, tag_stats,
+                                  self.flag_checkers, self.string_transformers)
+        self.lexicon[string] = word
+        return word
+
+    def add_flag(self, flag_checker):
+        cdef Lexeme word
+        flag_id = len(self.flag_checkers)
+        for string, word in self.lexicon.items():
+            if flag_checker(string, word.prob, {}):
+                word.set_flag(flag_id)
+        self.flag_checkers.append(flag_checker)
+        return flag_id
+
+    def add_transform(self, string_transform):
+        self.string_transformers.append(string_transform)
+        return len(self.string_transformers) - 1
+
+    def load_probs(self, location):
+        """Load unigram probabilities.
+        """
+        self.probs = json.load(location)
+
+        cdef Lexeme word
+        cdef unicode string
+
+        for string, word in self.lexicon.items():
+            prob = _pop_default(self.probs, string, 0.0)
+            word.prob = prob
+
+    def load_clusters(self, location):
+        self.probs = json.load(location)
+
+        cdef Lexeme word
+        cdef unicode string
+
+        for string, word in self.lexicon.items():
+            cluster = _pop_default(self.cluster, string, 0)
+            word.cluster = cluster
+
+    def load_stats(self, location):
+        """Load distributional stats.
+        """
+        raise NotImplementedError
+
+
+def _pop_default(dict d, key, default):
+    return d.pop(key) if key in d else default
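_pop_default is dict.pop with a default: it consumes the entry, so once a string's stats have been copied onto its Lexeme they no longer sit in the loading dicts. A quick demonstration of the consuming behaviour:

    def _pop_default(d, key, default):
        return d.pop(key) if key in d else default

    probs = {u'the': -1.2}
    assert _pop_default(probs, u'the', 0.0) == -1.2
    assert probs == {}                          # entry consumed
    assert _pop_default(probs, u'the', 0.0) == 0.0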
diff --git a/spacy/util.py b/spacy/util.py (deleted, 54 lines)
@@ -1,54 +0,0 @@
-import os
-from os import path
-import codecs
-import json
-
-DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
-
-
-def utf8open(loc, mode='r'):
-    return codecs.open(loc, mode, 'utf8')
-
-
-def load_case_stats(data_dir):
-    case_loc = path.join(data_dir, 'case')
-    case_stats = {}
-    with utf8open(case_loc) as cases_file:
-        for line in cases_file:
-            word, upper, title = line.split()
-            case_stats[word] = (float(upper), float(title))
-    return case_stats
-
-
-def read_dist_info(lang):
-    dist_path = path.join(DATA_DIR, lang, 'distribution_info.json')
-    if path.exists(dist_path):
-        with open(dist_path) as file_:
-            dist_info = json.load(file_)
-    else:
-        dist_info = {}
-    return dist_info
-
-
-def read_tokenization(lang):
-    loc = path.join(DATA_DIR, lang, 'tokenization')
-    entries = []
-    seen = set()
-    with utf8open(loc) as file_:
-        for line in file_:
-            line = line.strip()
-            if line.startswith('#'):
-                continue
-            if not line:
-                continue
-            pieces = line.split()
-            chunk = pieces.pop(0)
-            assert chunk not in seen, chunk
-            seen.add(chunk)
-            entries.append((chunk, list(pieces)))
-            if chunk[0].isalpha() and chunk[0].islower():
-                chunk = chunk[0].title() + chunk[1:]
-                pieces[0] = pieces[0][0].title() + pieces[0][1:]
-                seen.add(chunk)
-                entries.append((chunk, pieces))
-    return entries
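For reference, the deleted read_tokenization parsed one rule per whitespace-separated line (the chunk first, its tokens after; '#' lines are comments) and auto-generated a title-cased variant for lower-case rules. A worked example of what one line produced:

    line = u"ain't are not"
    pieces = line.split()
    chunk = pieces.pop(0)
    entries = [(chunk, list(pieces))]
    # read_tokenization also added the title-cased variant (it mutated
    # pieces[0] in place; a fresh list is built here for clarity):
    entries.append((chunk[0].title() + chunk[1:],
                    [pieces[0][0].title() + pieces[0][1:]] + pieces[1:]))
    assert entries == [(u"ain't", [u'are', u'not']),
                       (u"Ain't", [u'Are', u'not'])]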
diff --git a/spacy/word.pxd b/spacy/word.pxd
@@ -7,19 +7,19 @@ DEF MAX_FLAG = 64
 cdef class Lexeme:
     # NB: the readonly keyword refers to _Python_ access. The attributes are
     # writeable from Cython.
-    cdef readonly id_t id
-    cdef readonly size_t length
-    cdef readonly double prob
-    cdef readonly size_t cluster
+    cpdef readonly id_t id
+    cpdef readonly size_t length
+    cpdef readonly double prob
+    cpdef readonly size_t cluster
 
-    cdef readonly utf8_t* strings
-    cdef readonly size_t nr_strings
+    cdef utf8_t* views
+    cdef size_t nr_views
 
     cdef readonly flag_t flags
 
     cpdef bint check_flag(self, size_t flag_id) except *
     cpdef int set_flag(self, size_t flag_id) except -1
 
-    cpdef unicode get_string(self, size_t i) except *
-    cpdef id_t get_id(self, size_t i) except 0
-    cpdef int add_strings(self, list strings) except -1
+    cpdef unicode get_view_string(self, size_t i)
+    cpdef id_t get_view_id(self, size_t i) except 0
+    cpdef int add_view(self, unicode view) except -1
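The flags field packs up to MAX_FLAG (64) booleans into one integer, and check_flag/set_flag are single-bit operations against it. The same logic in plain Python:

    MAX_FLAG = 64

    class Flags(object):
        def __init__(self):
            self.flags = 0

        def set_flag(self, flag_id):
            assert flag_id < MAX_FLAG
            self.flags |= 1 << flag_id

        def check_flag(self, flag_id):
            assert flag_id < MAX_FLAG
            return bool(self.flags & (1 << flag_id))

    f = Flags()
    f.set_flag(3)
    assert f.check_flag(3) and not f.check_flag(2)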
diff --git a/spacy/word.pyx b/spacy/word.pyx (176 changes)
@@ -2,10 +2,7 @@
 # cython: embedsignature=True
 
-from libc.stdlib cimport calloc, free
-
-from spacy cimport flags
+from libc.stdlib cimport calloc, free, realloc
 
 cdef class Lexeme:
     """A lexical type.

@@ -53,7 +50,7 @@ cdef class Lexeme:
             the same cluster ID as "pineapple", which is not what we'd like.
     """
     def __cinit__(self, utf8_t string, size_t length, list views, prob=0.0,
-                  cluster=0, orth_flags=0, dist_flags=0, possible_tags=0):
+                  cluster=0, flags=0):
         self.id = <id_t>&string
         self.length = length
         self.nr_strings = 0

@@ -66,25 +63,21 @@ cdef class Lexeme:
         def __get__(self):
             return self.strings[0].decode('utf8')
 
-    cpdef unicode get_view_string(self, size_t i) except *:
+    cpdef unicode get_view_string(self, size_t i):
         assert i < self.nr_strings
         return self.strings[i].decode('utf8')
 
-    cpdef intptr_t get_view_id(self, size_t i) except 0:
+    cpdef id_t get_view_id(self, size_t i) except 0:
         assert i < self.nr_strings
-        return <string_id_t>&self.views[i]
+        return <id_t>&self.views[i]
 
-    cpdef int add_views(self, list views) except -1:
-        self.nr_views += len(strings)
+    cpdef int add_view(self, unicode view) except -1:
+        self.nr_views += 1
         self.views = <char**>realloc(self.views, self.nr_views * sizeof(utf8_t))
-        cdef unicode view
-        cdef bytes utf8_string
-        for i, view in enumerate(strings):
-            view = string_views[i]
-            utf8_string = view.encode('utf8')
-            # Intern strings, allowing pointer comparison
-            utf8_string = intern(utf8_string)
-            self.views[i] = utf8_string
+        cdef bytes utf8_string = view.encode('utf8')
+        # Intern strings, allowing pointer comparison
+        utf8_string = intern(utf8_string)
+        self.views[self.nr_views - 1] = utf8_string
 
     cpdef bint check_flag(self, size_t flag_id) except *:
         """Access the value of one of the pre-computed boolean distribution features.
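The interning in add_view is what makes get_view_id meaningful: interning maps equal strings to one shared object, so an address comparison can stand in for a full string comparison. Illustrated with Python 3's sys.intern (the Cython code interns utf-8 bytes via the Python 2 builtin; same idea):

    import sys

    # Runtime joins produce distinct objects; interning unifies them.
    a = sys.intern(''.join(['ab', 'c']))
    b = sys.intern(''.join(['a', 'bc']))
    assert a is b    # one shared object, so identity compare suffices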
@@ -92,154 +85,7 @@ cdef class Lexeme:
         Meanings depend on the language-specific distributional features being loaded.
         The suggested features for latin-alphabet languages are: TODO
         """
         assert flag_id < flags.MAX_FLAG
         return self.flags & (1 << flag_id)
 
     cpdef int set_flag(self, size_t flag_id) except -1:
         assert flag_id < flags.MAX_FLAG
         self.flags |= (1 << flag_id)
-
-
-#
-#cdef class CasedWord(Word):
-#    def __cinit__(self, bytes string, list views):
-#        Word.__cinit__(self, string, string_views)
-#
-#    cpdef bint is_often_uppered(self) except *:
-#        '''Check the OFT_UPPER distributional flag for the word.
-#
-#        The OFT_UPPER flag records whether a lower-cased version of the word
-#        is found in all-upper case frequently in a large sample of text, where
-#        "frequently" is defined as P >= 0.95 (chosen for high mutual information for
-#        POS tagging).
-#
-#        Case statistics are estimated from a large text corpus. Estimates are read
-#        from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
-#
-#        >>> is_often_uppered(lookup(u'nato'))
-#        True
-#        >>> is_often_uppered(lookup(u'the'))
-#        False
-#        '''
-#        return self.dist_flags & (1 << OFT_UPPER)
-#
-#
-#    cpdef bint is_often_titled(self) except *:
-#        '''Check the OFT_TITLE distributional flag for the word.
-#
-#        The OFT_TITLE flag records whether a lower-cased version of the word
-#        is found title-cased (see string.istitle) frequently in a large sample of text,
-#        where "frequently" is defined as P >= 0.3 (chosen for high mutual information for
-#        POS tagging).
-#
-#        Case statistics are estimated from a large text corpus. Estimates are read
-#        from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
-#
-#        >>> is_oft_upper(lookup(u'john'))
-#        True
-#        >>> is_oft_upper(lookup(u'Bill'))
-#        False
-#        '''
-#        return self.dist_flags & (1 << OFT_TITLE)
-#
-#
-#    cpdef bint is_alpha(self) except *:
-#        """Check whether all characters in the word's string are alphabetic.
-#
-#        Should match the :py:func:`unicode.isalpha()` function.
-#
-#        >>> is_alpha(lookup(u'Hello'))
-#        True
-#        >>> is_alpha(lookup(u'العرب'))
-#        True
-#        >>> is_alpha(lookup(u'10'))
-#        False
-#        """
-#        return self.orth_flags & 1 << IS_ALPHA
-#
-#    cpdef bint is_digit(self) except *:
-#        """Check whether all characters in the word's string are numeric.
-#
-#        Should match the :py:func:`unicode.isdigit()` function.
-#
-#        >>> is_digit(lookup(u'10'))
-#        True
-#        >>> is_digit(lookup(u'๐'))
-#        True
-#        >>> is_digit(lookup(u'one'))
-#        False
-#        """
-#        return self.orth_flags & 1 << IS_DIGIT
-#
-#    cpdef bint is_punct(self) except *:
-#        """Check whether all characters belong to a punctuation unicode data category
-#        for a Lexeme ID.
-#
-#        >>> is_punct(lookup(u'.'))
-#        True
-#        >>> is_punct(lookup(u'⁒'))
-#        True
-#        >>> is_punct(lookup(u' '))
-#        False
-#        """
-#        return self.orth_flags & 1 << IS_PUNCT
-#
-#    cpdef bint is_space(self) except *:
-#        """Give the result of unicode.isspace() for a Lexeme ID.
-#
-#        >>> is_space(lookup(u'\\t'))
-#        True
-#        >>> is_space(lookup(u'<unicode space>'))
-#        True
-#        >>> is_space(lookup(u'Hi\\n'))
-#        False
-#        """
-#        return self.orth_flags & 1 << IS_SPACE
-#
-#    cpdef bint is_lower(self) except *:
-#        """Give the result of unicode.islower() for a Lexeme ID.
-#
-#        >>> is_lower(lookup(u'hi'))
-#        True
-#        >>> is_lower(lookup(<unicode>))
-#        True
-#        >>> is_lower(lookup(u'10'))
-#        False
-#        """
-#        return self.orth_flags & 1 << IS_LOWER
-#
-#    cpdef bint is_upper(self) except *:
-#        """Give the result of unicode.isupper() for a Lexeme ID.
-#
-#        >>> is_upper(lookup(u'HI'))
-#        True
-#        >>> is_upper(lookup(u'H10'))
-#        True
-#        >>> is_upper(lookup(u'10'))
-#        False
-#        """
-#        return self.orth_flags & 1 << IS_UPPER
-#
-#    cpdef bint is_title(self) except *:
-#        """Give the result of unicode.istitle() for a Lexeme ID.
-#
-#        >>> is_title(lookup(u'Hi'))
-#        True
-#        >>> is_title(lookup(u'Hi1'))
-#        True
-#        >>> is_title(lookup(u'1'))
-#        False
-#        """
-#        return self.orth_flags & 1 << IS_TITLE
-#
-#    cpdef bint is_ascii(self) except *:
-#        """Give the result of checking whether all characters in the string are ascii.
-#
-#        >>> is_ascii(lookup(u'Hi'))
-#        True
-#        >>> is_ascii(lookup(u' '))
-#        True
-#        >>> is_title(lookup(u'<unicode>'))
-#        False
-#        """
-#        return self.orth_flags & 1 << IS_ASCII