mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	* Make PyPy work
This commit is contained in:
		
							parent
							
								
									1dd663ea03
								
							
						
					
					
						commit
						3f1944d688
					
				| 
						 | 
				
			
			@ -1,4 +1,3 @@
 | 
			
		|||
# cython: profile=True
 | 
			
		||||
from __future__ import unicode_literals
 | 
			
		||||
from __future__ import division
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,20 +1,19 @@
 | 
			
		|||
# cython: embedsignature=True
 | 
			
		||||
from .. import orth
 | 
			
		||||
from ..orth cimport is_alpha, is_ascii, is_digit, is_lower, is_punct, is_space
 | 
			
		||||
from ..orth cimport is_title, is_upper, like_url, like_number
 | 
			
		||||
from ..typedefs cimport flags_t
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_flags(unicode string):
 | 
			
		||||
    cdef flags_t flags = 0
 | 
			
		||||
    flags |= orth.is_alpha(string) << IS_ALPHA
 | 
			
		||||
    flags |= orth.is_ascii(string) << IS_ASCII
 | 
			
		||||
    flags |= orth.is_digit(string) << IS_DIGIT
 | 
			
		||||
    flags |= orth.is_lower(string) << IS_LOWER
 | 
			
		||||
    flags |= orth.is_punct(string) << IS_PUNCT
 | 
			
		||||
    flags |= orth.is_space(string) << IS_SPACE
 | 
			
		||||
    flags |= orth.is_title(string) << IS_TITLE
 | 
			
		||||
    flags |= orth.is_upper(string) << IS_UPPER
 | 
			
		||||
    flags |= orth.like_url(string) << LIKE_URL
 | 
			
		||||
    flags |= orth.like_number(string) << LIKE_NUM
 | 
			
		||||
    flags |= is_alpha(string) << IS_ALPHA
 | 
			
		||||
    flags |= is_ascii(string) << IS_ASCII
 | 
			
		||||
    flags |= is_digit(string) << IS_DIGIT
 | 
			
		||||
    flags |= is_lower(string) << IS_LOWER
 | 
			
		||||
    flags |= is_punct(string) << IS_PUNCT
 | 
			
		||||
    flags |= is_space(string) << IS_SPACE
 | 
			
		||||
    flags |= is_title(string) << IS_TITLE
 | 
			
		||||
    flags |= is_upper(string) << IS_UPPER
 | 
			
		||||
    flags |= like_url(string) << LIKE_URL
 | 
			
		||||
    flags |= like_number(string) << LIKE_NUM
 | 
			
		||||
    return flags
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,4 +1,3 @@
 | 
			
		|||
# cython: profile=True
 | 
			
		||||
from os import path
 | 
			
		||||
import json
 | 
			
		||||
import os
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,4 +1,3 @@
 | 
			
		|||
# cython: profile=True
 | 
			
		||||
# cython: embedsignature=True
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										12
									
								
								spacy/orth.pxd
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										12
									
								
								spacy/orth.pxd
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,12 @@
 | 
			
		|||
cpdef bint is_alpha(unicode string)
 | 
			
		||||
cpdef bint is_digit(unicode string)
 | 
			
		||||
cpdef bint is_punct(unicode string)
 | 
			
		||||
cpdef bint is_space(unicode string)
 | 
			
		||||
cpdef bint is_ascii(unicode string)
 | 
			
		||||
cpdef bint is_title(unicode string)
 | 
			
		||||
cpdef bint is_lower(unicode string)
 | 
			
		||||
cpdef bint is_upper(unicode string)
 | 
			
		||||
cpdef bint like_url(unicode string)
 | 
			
		||||
cpdef bint like_number(unicode string)
 | 
			
		||||
cpdef unicode word_shape(unicode string)
 | 
			
		||||
cpdef bytes asciied(unicode string)
 | 
			
		||||
| 
						 | 
				
			
			@ -11,15 +11,15 @@ TAGS = 'adj adp adv conj det noun num pdt pos pron prt punct verb'.upper().split
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
# Binary string features
 | 
			
		||||
def is_alpha(string):
 | 
			
		||||
cpdef bint is_alpha(unicode string):
 | 
			
		||||
    return string.isalpha()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def is_digit(string):
 | 
			
		||||
cpdef bint is_digit(unicode string):
 | 
			
		||||
    return string.isdigit()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def is_punct(string):
 | 
			
		||||
cpdef bint is_punct(unicode string):
 | 
			
		||||
    for c in string:
 | 
			
		||||
        if not unicodedata.category(c).startswith('P'):
 | 
			
		||||
            return False
 | 
			
		||||
| 
						 | 
				
			
			@ -27,11 +27,11 @@ def is_punct(string):
 | 
			
		|||
        return True
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def is_space(string):
 | 
			
		||||
cpdef bint is_space(unicode string):
 | 
			
		||||
    return string.isspace()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def is_ascii(string):
 | 
			
		||||
cpdef bint is_ascii(unicode string):
 | 
			
		||||
    for c in string:
 | 
			
		||||
        if ord(c) >= 128:
 | 
			
		||||
            return False
 | 
			
		||||
| 
						 | 
				
			
			@ -39,15 +39,15 @@ def is_ascii(string):
 | 
			
		|||
        return True
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def is_title(string):
 | 
			
		||||
cpdef bint is_title(unicode string):
 | 
			
		||||
    return string.istitle()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def is_lower(string):
 | 
			
		||||
cpdef bint is_lower(unicode string):
 | 
			
		||||
    return string.islower()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def is_upper(string):
 | 
			
		||||
cpdef bint is_upper(unicode string):
 | 
			
		||||
    return string.isupper()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -66,7 +66,7 @@ TLDs = set("com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|mu
 | 
			
		|||
        "wf|ws|ye|yt|za|zm|zw".split('|'))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def like_url(string):
 | 
			
		||||
cpdef bint like_url(unicode string):
 | 
			
		||||
    # We're looking for things that function in text like URLs. So, valid URL
 | 
			
		||||
    # or not, anything they say http:// is going to be good.
 | 
			
		||||
    if string.startswith('http://'):
 | 
			
		||||
| 
						 | 
				
			
			@ -74,7 +74,14 @@ def like_url(string):
 | 
			
		|||
    elif string.startswith('www.') and len(string) >= 5:
 | 
			
		||||
        return True
 | 
			
		||||
    # No dots? Not URLish enough
 | 
			
		||||
    if string[0] == '.' or string[-1] == '.' or '.' not in string:
 | 
			
		||||
    if string[0] == '.' or string[-1] == '.':
 | 
			
		||||
        return False
 | 
			
		||||
    # This should be a call to "in", but PyPy lacks this function?
 | 
			
		||||
    cdef int i
 | 
			
		||||
    for i in range(len(string)):
 | 
			
		||||
        if string[i] == '.':
 | 
			
		||||
            break
 | 
			
		||||
    else:
 | 
			
		||||
        return False
 | 
			
		||||
    tld = string.rsplit('.', 1)[1].split(':', 1)[0]
 | 
			
		||||
    if tld.endswith('/'):
 | 
			
		||||
| 
						 | 
				
			
			@ -90,7 +97,7 @@ NUM_WORDS = set('zero one two three four five six seven eight nine ten'
 | 
			
		|||
                'eighteen nineteen twenty thirty forty fifty sixty seventy'
 | 
			
		||||
                'eighty ninety hundred thousand million billion trillion'
 | 
			
		||||
                'quadrillion gajillion bazillion'.split())
 | 
			
		||||
def like_number(string):
 | 
			
		||||
cpdef bint like_number(unicode string):
 | 
			
		||||
    string = string.replace(',', '')
 | 
			
		||||
    string = string.replace('.', '')
 | 
			
		||||
    if string.isdigit():
 | 
			
		||||
| 
						 | 
				
			
			@ -103,30 +110,8 @@ def like_number(string):
 | 
			
		|||
        return True
 | 
			
		||||
    return False
 | 
			
		||||
 | 
			
		||||
# Statistics features
 | 
			
		||||
def oft_case(name, thresh):
 | 
			
		||||
    def wrapped(string, prob, case_stats, tag_stats):
 | 
			
		||||
        return string
 | 
			
		||||
    return wrapped
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def can_tag(name, thresh=0.5):
 | 
			
		||||
    def wrapped(string, prob, case_stats, tag_stats):
 | 
			
		||||
        return string
 | 
			
		||||
    return wrapped
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# String features
 | 
			
		||||
def canon_case(string, upper_pc=0.0, title_pc=0.0, lower_pc=0.0):
 | 
			
		||||
    if upper_pc >= lower_pc and upper_pc >= title_pc:
 | 
			
		||||
        return string.upper()
 | 
			
		||||
    elif title_pc >= lower_pc:
 | 
			
		||||
        return string.title()
 | 
			
		||||
    else:
 | 
			
		||||
        return string.lower()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def word_shape(string):
 | 
			
		||||
cpdef unicode word_shape(unicode string):
 | 
			
		||||
    length = len(string)
 | 
			
		||||
    shape = []
 | 
			
		||||
    last = ""
 | 
			
		||||
| 
						 | 
				
			
			@ -152,7 +137,7 @@ def word_shape(string):
 | 
			
		|||
    return ''.join(shape)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def asciied(string):
 | 
			
		||||
cpdef bytes asciied(unicode string):
 | 
			
		||||
    ascii_string = unidecode(string)
 | 
			
		||||
    if not ascii_string:
 | 
			
		||||
        return b'???'
 | 
			
		||||
| 
						 | 
				
			
			@ -1,4 +1,3 @@
 | 
			
		|||
# cython: profile=True
 | 
			
		||||
"""
 | 
			
		||||
Fill an array, context, with every _atomic_ value our features reference.
 | 
			
		||||
We then write the _actual features_ as tuples of the atoms. The machinery
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,4 +1,3 @@
 | 
			
		|||
# cython: profile=True
 | 
			
		||||
from libc.string cimport memmove
 | 
			
		||||
from cymem.cymem cimport Pool
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,4 +1,3 @@
 | 
			
		|||
# cython: profile=True
 | 
			
		||||
from ._state cimport State
 | 
			
		||||
from ._state cimport has_head, get_idx, get_s0, get_n0
 | 
			
		||||
from ._state cimport is_final, at_eol, pop_stack, push_stack, add_dep
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,4 +1,3 @@
 | 
			
		|||
# cython: profile=True
 | 
			
		||||
"""
 | 
			
		||||
MALT-style dependency parser
 | 
			
		||||
"""
 | 
			
		||||
| 
						 | 
				
			
			@ -54,7 +53,7 @@ def get_templates(name):
 | 
			
		|||
    if name == 'zhang':
 | 
			
		||||
        return pf.unigrams, pf.arc_eager
 | 
			
		||||
    else:
 | 
			
		||||
        return pf.hasty, (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s0_n1 + pf.n0_n1 + \
 | 
			
		||||
        return pf.unigrams, (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s0_n1 + pf.n0_n1 + \
 | 
			
		||||
                             pf.tree_shape + pf.trigrams)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -64,7 +63,8 @@ cdef class GreedyParser:
 | 
			
		|||
        self.cfg = Config.read(model_dir, 'config')
 | 
			
		||||
        self.moves = TransitionSystem(self.cfg.left_labels, self.cfg.right_labels)
 | 
			
		||||
        hasty_templ, full_templ = get_templates(self.cfg.features)
 | 
			
		||||
        self.model = Model(self.moves.n_moves, full_templ, model_dir)
 | 
			
		||||
        self.model = HastyModel(self.moves.n_moves, hasty_templ, full_templ,
 | 
			
		||||
                                model_dir)
 | 
			
		||||
 | 
			
		||||
    cpdef int parse(self, Tokens tokens) except -1:
 | 
			
		||||
        cdef:
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,4 +1,3 @@
 | 
			
		|||
# cython: profile=True
 | 
			
		||||
# cython: embedsignature=True
 | 
			
		||||
from __future__ import unicode_literals
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,7 +1,6 @@
 | 
			
		|||
from libc.stdint cimport uint32_t
 | 
			
		||||
 | 
			
		||||
import numpy as np
 | 
			
		||||
cimport numpy as np
 | 
			
		||||
from cython.view cimport array as cvarray
 | 
			
		||||
 | 
			
		||||
from cymem.cymem cimport Pool
 | 
			
		||||
from thinc.typedefs cimport atom_t
 | 
			
		||||
| 
						 | 
				
			
			@ -39,7 +38,7 @@ cdef class Tokens:
 | 
			
		|||
 | 
			
		||||
    cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1
 | 
			
		||||
 | 
			
		||||
    cpdef np.ndarray[long, ndim=2] to_array(self, object features)
 | 
			
		||||
    cpdef long[:,:] to_array(self, object features)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
cdef class Token:
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,5 +1,5 @@
 | 
			
		|||
# cython: profile=True
 | 
			
		||||
# cython: embedsignature=True
 | 
			
		||||
from cython.view cimport array as cvarray
 | 
			
		||||
 | 
			
		||||
from preshed.maps cimport PreshMap
 | 
			
		||||
from preshed.counter cimport PreshCounter
 | 
			
		||||
| 
						 | 
				
			
			@ -12,9 +12,6 @@ from .typedefs cimport POS, LEMMA
 | 
			
		|||
 | 
			
		||||
cimport cython
 | 
			
		||||
 | 
			
		||||
import numpy as np
 | 
			
		||||
cimport numpy as np
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
DEF PADDING = 5
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -115,7 +112,7 @@ cdef class Tokens:
 | 
			
		|||
        return idx + t.lex.length
 | 
			
		||||
 | 
			
		||||
    @cython.boundscheck(False)
 | 
			
		||||
    cpdef np.ndarray[long, ndim=2] to_array(self, object attr_ids):
 | 
			
		||||
    cpdef long[:,:] to_array(self, object attr_ids):
 | 
			
		||||
        """Given a list of M attribute IDs, export the tokens to a numpy ndarray
 | 
			
		||||
        of shape N*M, where N is the length of the sentence.
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -129,8 +126,8 @@ cdef class Tokens:
 | 
			
		|||
        """
 | 
			
		||||
        cdef int i, j
 | 
			
		||||
        cdef attr_id_t feature
 | 
			
		||||
        cdef np.ndarray[long, ndim=2] output
 | 
			
		||||
        output = np.ndarray(shape=(self.length, len(attr_ids)), dtype=int)
 | 
			
		||||
        cdef long[:,:] output = cvarray(shape=(self.length, len(attr_ids)),
 | 
			
		||||
                                        itemsize=sizeof(long), format="l")
 | 
			
		||||
        for i in range(self.length):
 | 
			
		||||
            for j, feature in enumerate(attr_ids):
 | 
			
		||||
                output[i, j] = get_token_attr(&self.data[i], feature)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,7 +1,7 @@
 | 
			
		|||
import os
 | 
			
		||||
from os import path
 | 
			
		||||
import codecs
 | 
			
		||||
import ujson
 | 
			
		||||
import json
 | 
			
		||||
import re
 | 
			
		||||
 | 
			
		||||
DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
 | 
			
		||||
| 
						 | 
				
			
			@ -13,7 +13,7 @@ def utf8open(loc, mode='r'):
 | 
			
		|||
 | 
			
		||||
def read_lang_data(data_dir):
 | 
			
		||||
    with open(path.join(data_dir, 'specials.json')) as file_:
 | 
			
		||||
        tokenization = ujson.load(file_)
 | 
			
		||||
        tokenization = json.load(file_)
 | 
			
		||||
    prefix = read_prefix(data_dir)
 | 
			
		||||
    suffix = read_suffix(data_dir)
 | 
			
		||||
    infix = read_infix(data_dir)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue
	
	Block a user