Mirror of https://github.com/explosion/spaCy.git
commit 3f1944d688 (parent 1dd663ea03)

    Make PyPy work
@@ -1,4 +1,3 @@
-# cython: profile=True
 from __future__ import unicode_literals
 from __future__ import division
 
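A note on the "# cython: profile=True" deletions here and in several hunks
below: the directive makes Cython emit hooks for CPython's cProfile
machinery, which PyPy did not support, so making PyPy work starts with
stripping it from every module. For reference, a sketch of how the
directive is normally scoped (the function is illustrative only):

    # cython: profile=True        # module-wide: emit cProfile hooks everywhere
    cimport cython

    @cython.profile(False)        # opt a hot inner function back out
    cdef int dot(int* xs, int* ys, int n):
        cdef int i, total = 0
        for i in range(n):
            total += xs[i] * ys[i]
        return total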
@@ -1,20 +1,19 @@
 # cython: embedsignature=True
-from .. import orth
+from ..orth cimport is_alpha, is_ascii, is_digit, is_lower, is_punct, is_space
+from ..orth cimport is_title, is_upper, like_url, like_number
 from ..typedefs cimport flags_t
 
 
 def get_flags(unicode string):
     cdef flags_t flags = 0
-    flags |= orth.is_alpha(string) << IS_ALPHA
-    flags |= orth.is_ascii(string) << IS_ASCII
-    flags |= orth.is_digit(string) << IS_DIGIT
-    flags |= orth.is_lower(string) << IS_LOWER
-    flags |= orth.is_punct(string) << IS_PUNCT
-    flags |= orth.is_space(string) << IS_SPACE
-    flags |= orth.is_title(string) << IS_TITLE
-    flags |= orth.is_upper(string) << IS_UPPER
-    flags |= orth.like_url(string) << LIKE_URL
-    flags |= orth.like_number(string) << LIKE_NUM
+    flags |= is_alpha(string) << IS_ALPHA
+    flags |= is_ascii(string) << IS_ASCII
+    flags |= is_digit(string) << IS_DIGIT
+    flags |= is_lower(string) << IS_LOWER
+    flags |= is_punct(string) << IS_PUNCT
+    flags |= is_space(string) << IS_SPACE
+    flags |= is_title(string) << IS_TITLE
+    flags |= is_upper(string) << IS_UPPER
+    flags |= like_url(string) << LIKE_URL
+    flags |= like_number(string) << LIKE_NUM
     return flags
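The get_flags rewrite keeps the same bit layout: each orthographic
predicate contributes one bit of a flags_t integer, so testing a property
later is a single mask operation. A minimal Python sketch of the round
trip (the bit positions are hypothetical; the real values come from
spaCy's attribute constants):

    IS_ALPHA, IS_DIGIT, IS_LOWER = 1, 2, 3   # illustrative bit positions

    def get_flags(string):
        flags = 0
        flags |= string.isalpha() << IS_ALPHA   # a bool shifts as 0 or 1
        flags |= string.isdigit() << IS_DIGIT
        flags |= string.islower() << IS_LOWER
        return flags

    def check_flag(flags, flag_id):
        return bool(flags & (1 << flag_id))

    assert check_flag(get_flags(u"spam"), IS_ALPHA)
    assert not check_flag(get_flags(u"spam"), IS_DIGIT)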
@@ -1,4 +1,3 @@
-# cython: profile=True
 from os import path
 import json
 import os
@@ -1,4 +1,3 @@
-# cython: profile=True
 # cython: embedsignature=True
 
 
spacy/orth.pxd (new file, 12 lines)
@@ -0,0 +1,12 @@
+cpdef bint is_alpha(unicode string)
+cpdef bint is_digit(unicode string)
+cpdef bint is_punct(unicode string)
+cpdef bint is_space(unicode string)
+cpdef bint is_ascii(unicode string)
+cpdef bint is_title(unicode string)
+cpdef bint is_lower(unicode string)
+cpdef bint is_upper(unicode string)
+cpdef bint like_url(unicode string)
+cpdef bint like_number(unicode string)
+cpdef unicode word_shape(unicode string)
+cpdef bytes asciied(unicode string)
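Declaring these functions cpdef in a new .pxd is what makes the import
changes above work: Python callers still see ordinary functions, while
Cython modules that cimport them call straight into the C implementations
with no Python call overhead. Roughly (module paths as in the diff):

    # From Python -- dynamic dispatch through the module namespace:
    from spacy import orth
    orth.is_alpha(u"spam")

    # From a Cython .pyx module -- bound to the C entry point at compile time:
    #     from spacy.orth cimport is_alpha
    #     cdef bint flag = is_alpha(u"spam")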
@@ -11,15 +11,15 @@ TAGS = 'adj adp adv conj det noun num pdt pos pron prt punct verb'.upper().split
 
 
 # Binary string features
-def is_alpha(string):
+cpdef bint is_alpha(unicode string):
     return string.isalpha()
 
 
-def is_digit(string):
+cpdef bint is_digit(unicode string):
     return string.isdigit()
 
 
-def is_punct(string):
+cpdef bint is_punct(unicode string):
     for c in string:
         if not unicodedata.category(c).startswith('P'):
             return False
@@ -27,11 +27,11 @@ def is_punct(string):
     return True
 
 
-def is_space(string):
+cpdef bint is_space(unicode string):
     return string.isspace()
 
 
-def is_ascii(string):
+cpdef bint is_ascii(unicode string):
     for c in string:
         if ord(c) >= 128:
             return False
@@ -39,15 +39,15 @@ def is_ascii(string):
     return True
 
 
-def is_title(string):
+cpdef bint is_title(unicode string):
     return string.istitle()
 
 
-def is_lower(string):
+cpdef bint is_lower(unicode string):
     return string.islower()
 
 
-def is_upper(string):
+cpdef bint is_upper(unicode string):
     return string.isupper()
 
 
@@ -66,7 +66,7 @@ TLDs = set("com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|mu
            "wf|ws|ye|yt|za|zm|zw".split('|'))
 
 
-def like_url(string):
+cpdef bint like_url(unicode string):
     # We're looking for things that function in text like URLs. So, valid URL
     # or not, anything they say http:// is going to be good.
     if string.startswith('http://'):
@@ -74,7 +74,14 @@ def like_url(string):
     elif string.startswith('www.') and len(string) >= 5:
         return True
     # No dots? Not URLish enough
-    if string[0] == '.' or string[-1] == '.' or '.' not in string:
+    if string[0] == '.' or string[-1] == '.':
         return False
+    # This should be a call to "in", but PyPy lacks this function?
+    cdef int i
+    for i in range(len(string)):
+        if string[i] == '.':
+            break
+    else:
+        return False
     tld = string.rsplit('.', 1)[1].split(':', 1)[0]
     if tld.endswith('/'):
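The workaround replaces the "'.' not in string" membership test with an
explicit scan built on Python's for/else: the else arm runs only when the
loop finishes without hitting break, i.e. when no dot was found. The same
idiom in plain Python:

    def contains_dot(string):
        # Equivalent to "'.' in string", avoiding unicode.__contains__,
        # which the commit comment says the targeted PyPy lacked.
        for i in range(len(string)):
            if string[i] == '.':
                break          # found one; the else arm is skipped
        else:
            return False       # loop ran to completion: no dot anywhere
        return True

    assert contains_dot(u"www.example.com")
    assert not contains_dot(u"nodots")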
@@ -90,7 +97,7 @@ NUM_WORDS = set('zero one two three four five six seven eight nine ten'
                 'eighteen nineteen twenty thirty forty fifty sixty seventy'
                 'eighty ninety hundred thousand million billion trillion'
                 'quadrillion gajillion bazillion'.split())
-def like_number(string):
+cpdef bint like_number(unicode string):
     string = string.replace(',', '')
     string = string.replace('.', '')
     if string.isdigit():
@@ -103,30 +110,8 @@ def like_number(string):
         return True
     return False
 
-# Statistics features
-def oft_case(name, thresh):
-    def wrapped(string, prob, case_stats, tag_stats):
-        return string
-    return wrapped
-
-
-def can_tag(name, thresh=0.5):
-    def wrapped(string, prob, case_stats, tag_stats):
-        return string
-    return wrapped
-
-
-# String features
-def canon_case(string, upper_pc=0.0, title_pc=0.0, lower_pc=0.0):
-    if upper_pc >= lower_pc and upper_pc >= title_pc:
-        return string.upper()
-    elif title_pc >= lower_pc:
-        return string.title()
-    else:
-        return string.lower()
-
-
-def word_shape(string):
+cpdef unicode word_shape(unicode string):
     length = len(string)
     shape = []
     last = ""
|
@ -1,4 +1,3 @@
|
|||
# cython: profile=True
|
||||
"""
|
||||
Fill an array, context, with every _atomic_ value our features reference.
|
||||
We then write the _actual features_ as tuples of the atoms. The machinery
|
||||
|
|
|
@@ -1,4 +1,3 @@
-# cython: profile=True
 from libc.string cimport memmove
 from cymem.cymem cimport Pool
 
@@ -1,4 +1,3 @@
-# cython: profile=True
 from ._state cimport State
 from ._state cimport has_head, get_idx, get_s0, get_n0
 from ._state cimport is_final, at_eol, pop_stack, push_stack, add_dep
@@ -1,4 +1,3 @@
-# cython: profile=True
 """
 MALT-style dependency parser
 """
@@ -54,7 +53,7 @@ def get_templates(name):
     if name == 'zhang':
         return pf.unigrams, pf.arc_eager
     else:
-        return pf.hasty, (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s0_n1 + pf.n0_n1 + \
+        return pf.unigrams, (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s0_n1 + pf.n0_n1 + \
                           pf.tree_shape + pf.trigrams)
 
 
@@ -64,7 +63,8 @@ cdef class GreedyParser:
         self.cfg = Config.read(model_dir, 'config')
         self.moves = TransitionSystem(self.cfg.left_labels, self.cfg.right_labels)
         hasty_templ, full_templ = get_templates(self.cfg.features)
-        self.model = Model(self.moves.n_moves, full_templ, model_dir)
+        self.model = HastyModel(self.moves.n_moves, hasty_templ, full_templ,
+                                model_dir)
 
     cpdef int parse(self, Tokens tokens) except -1:
         cdef:
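The swap from Model to HastyModel wires both template sets into the
parser. A plausible reading of the design (a sketch of the idea only, not
thinc's actual implementation): score each state with the cheap unigram
templates first, and consult the expensive full-template model only when
the fast scores are inconclusive.

    class TwoStageScorer(object):
        """Hypothetical two-stage scorer illustrating a hasty/full split."""
        def __init__(self, fast, full, margin=1.0):
            self.fast, self.full, self.margin = fast, full, margin

        def score(self, feats):
            scores = self.fast.score(feats)    # cheap feature set
            ranked = sorted(scores, reverse=True)
            if ranked[0] - ranked[1] >= self.margin:
                return scores                  # fast path was decisive
            return self.full.score(feats)      # fall back to full templates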
@@ -1,4 +1,3 @@
-# cython: profile=True
 # cython: embedsignature=True
 from __future__ import unicode_literals
 
@@ -1,7 +1,6 @@
 from libc.stdint cimport uint32_t
 
-import numpy as np
-cimport numpy as np
+from cython.view cimport array as cvarray
 
 from cymem.cymem cimport Pool
 from thinc.typedefs cimport atom_t
@@ -39,7 +38,7 @@ cdef class Tokens:
 
     cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1
 
-    cpdef np.ndarray[long, ndim=2] to_array(self, object features)
+    cpdef long[:,:] to_array(self, object features)
 
 
 cdef class Token:
|
@ -1,5 +1,5 @@
|
|||
# cython: profile=True
|
||||
# cython: embedsignature=True
|
||||
from cython.view cimport array as cvarray
|
||||
|
||||
from preshed.maps cimport PreshMap
|
||||
from preshed.counter cimport PreshCounter
|
||||
|
@@ -12,9 +12,6 @@ from .typedefs cimport POS, LEMMA
 
 cimport cython
 
-import numpy as np
-cimport numpy as np
-
 
 DEF PADDING = 5
 
@@ -115,7 +112,7 @@ cdef class Tokens:
         return idx + t.lex.length
 
     @cython.boundscheck(False)
-    cpdef np.ndarray[long, ndim=2] to_array(self, object attr_ids):
+    cpdef long[:,:] to_array(self, object attr_ids):
         """Given a list of M attribute IDs, export the tokens to a numpy ndarray
         of shape N*M, where N is the length of the sentence.
 
@@ -129,8 +126,8 @@ cdef class Tokens:
         """
         cdef int i, j
         cdef attr_id_t feature
-        cdef np.ndarray[long, ndim=2] output
-        output = np.ndarray(shape=(self.length, len(attr_ids)), dtype=int)
+        cdef long[:,:] output = cvarray(shape=(self.length, len(attr_ids)),
+                                        itemsize=sizeof(long), format="l")
         for i in range(self.length):
            for j, feature in enumerate(attr_ids):
                output[i, j] = get_token_attr(&self.data[i], feature)
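Returning a long[:,:] typed memoryview instead of an np.ndarray removes
to_array's dependence on NumPy's C-API (the cimport numpy lines deleted
above), which was the sticking point under PyPy; cython.view.array now
owns the buffer. A caller that wants an ndarray can still wrap the result
without copying, assuming NumPy is importable on its interpreter:

    import numpy as np

    view = tokens.to_array(attr_ids)   # long[:, :] memoryview
    arr = np.asarray(view)             # wraps the same buffer, no copy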
@@ -1,7 +1,7 @@
 import os
 from os import path
 import codecs
-import ujson
+import json
 import re
 
 DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
@@ -13,7 +13,7 @@ def utf8open(loc, mode='r'):
 
 def read_lang_data(data_dir):
     with open(path.join(data_dir, 'specials.json')) as file_:
-        tokenization = ujson.load(file_)
+        tokenization = json.load(file_)
     prefix = read_prefix(data_dir)
     suffix = read_suffix(data_dir)
     infix = read_infix(data_dir)
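ujson is a C extension, so moving read_lang_data onto the stdlib json
module keeps it importable on PyPy; both expose the same load()/loads()
surface for this call. A common guard, shown only as an alternative the
commit did not take:

    try:
        import ujson as json   # fast C parser where available (CPython)
    except ImportError:
        import json            # pure-stdlib fallback; also works on PyPy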