mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 09:26:27 +03:00
* Make PyPy work
This commit is contained in:
parent
1dd663ea03
commit
3f1944d688
|
@ -1,4 +1,3 @@
|
||||||
# cython: profile=True
|
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
from __future__ import division
|
from __future__ import division
|
||||||
|
|
||||||
|
|
|
@ -1,20 +1,19 @@
|
||||||
# cython: embedsignature=True
|
# cython: embedsignature=True
|
||||||
from .. import orth
|
from ..orth cimport is_alpha, is_ascii, is_digit, is_lower, is_punct, is_space
|
||||||
|
from ..orth cimport is_title, is_upper, like_url, like_number
|
||||||
from ..typedefs cimport flags_t
|
from ..typedefs cimport flags_t
|
||||||
|
|
||||||
|
|
||||||
def get_flags(unicode string):
|
def get_flags(unicode string):
|
||||||
cdef flags_t flags = 0
|
cdef flags_t flags = 0
|
||||||
flags |= orth.is_alpha(string) << IS_ALPHA
|
flags |= is_alpha(string) << IS_ALPHA
|
||||||
flags |= orth.is_ascii(string) << IS_ASCII
|
flags |= is_ascii(string) << IS_ASCII
|
||||||
flags |= orth.is_digit(string) << IS_DIGIT
|
flags |= is_digit(string) << IS_DIGIT
|
||||||
flags |= orth.is_lower(string) << IS_LOWER
|
flags |= is_lower(string) << IS_LOWER
|
||||||
flags |= orth.is_punct(string) << IS_PUNCT
|
flags |= is_punct(string) << IS_PUNCT
|
||||||
flags |= orth.is_space(string) << IS_SPACE
|
flags |= is_space(string) << IS_SPACE
|
||||||
flags |= orth.is_title(string) << IS_TITLE
|
flags |= is_title(string) << IS_TITLE
|
||||||
flags |= orth.is_upper(string) << IS_UPPER
|
flags |= is_upper(string) << IS_UPPER
|
||||||
flags |= orth.like_url(string) << LIKE_URL
|
flags |= like_url(string) << LIKE_URL
|
||||||
flags |= orth.like_number(string) << LIKE_NUM
|
flags |= like_number(string) << LIKE_NUM
|
||||||
return flags
|
return flags
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
# cython: profile=True
|
|
||||||
from os import path
|
from os import path
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
# cython: profile=True
|
|
||||||
# cython: embedsignature=True
|
# cython: embedsignature=True
|
||||||
|
|
||||||
|
|
||||||
|
|
12
spacy/orth.pxd
Normal file
12
spacy/orth.pxd
Normal file
|
@ -0,0 +1,12 @@
|
||||||
|
cpdef bint is_alpha(unicode string)
|
||||||
|
cpdef bint is_digit(unicode string)
|
||||||
|
cpdef bint is_punct(unicode string)
|
||||||
|
cpdef bint is_space(unicode string)
|
||||||
|
cpdef bint is_ascii(unicode string)
|
||||||
|
cpdef bint is_title(unicode string)
|
||||||
|
cpdef bint is_lower(unicode string)
|
||||||
|
cpdef bint is_upper(unicode string)
|
||||||
|
cpdef bint like_url(unicode string)
|
||||||
|
cpdef bint like_number(unicode string)
|
||||||
|
cpdef unicode word_shape(unicode string)
|
||||||
|
cpdef bytes asciied(unicode string)
|
|
@ -11,15 +11,15 @@ TAGS = 'adj adp adv conj det noun num pdt pos pron prt punct verb'.upper().split
|
||||||
|
|
||||||
|
|
||||||
# Binary string features
|
# Binary string features
|
||||||
def is_alpha(string):
|
cpdef bint is_alpha(unicode string):
|
||||||
return string.isalpha()
|
return string.isalpha()
|
||||||
|
|
||||||
|
|
||||||
def is_digit(string):
|
cpdef bint is_digit(unicode string):
|
||||||
return string.isdigit()
|
return string.isdigit()
|
||||||
|
|
||||||
|
|
||||||
def is_punct(string):
|
cpdef bint is_punct(unicode string):
|
||||||
for c in string:
|
for c in string:
|
||||||
if not unicodedata.category(c).startswith('P'):
|
if not unicodedata.category(c).startswith('P'):
|
||||||
return False
|
return False
|
||||||
|
@ -27,11 +27,11 @@ def is_punct(string):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
def is_space(string):
|
cpdef bint is_space(unicode string):
|
||||||
return string.isspace()
|
return string.isspace()
|
||||||
|
|
||||||
|
|
||||||
def is_ascii(string):
|
cpdef bint is_ascii(unicode string):
|
||||||
for c in string:
|
for c in string:
|
||||||
if ord(c) >= 128:
|
if ord(c) >= 128:
|
||||||
return False
|
return False
|
||||||
|
@ -39,15 +39,15 @@ def is_ascii(string):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
def is_title(string):
|
cpdef bint is_title(unicode string):
|
||||||
return string.istitle()
|
return string.istitle()
|
||||||
|
|
||||||
|
|
||||||
def is_lower(string):
|
cpdef bint is_lower(unicode string):
|
||||||
return string.islower()
|
return string.islower()
|
||||||
|
|
||||||
|
|
||||||
def is_upper(string):
|
cpdef bint is_upper(unicode string):
|
||||||
return string.isupper()
|
return string.isupper()
|
||||||
|
|
||||||
|
|
||||||
|
@ -66,7 +66,7 @@ TLDs = set("com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|mu
|
||||||
"wf|ws|ye|yt|za|zm|zw".split('|'))
|
"wf|ws|ye|yt|za|zm|zw".split('|'))
|
||||||
|
|
||||||
|
|
||||||
def like_url(string):
|
cpdef bint like_url(unicode string):
|
||||||
# We're looking for things that function in text like URLs. So, valid URL
|
# We're looking for things that function in text like URLs. So, valid URL
|
||||||
# or not, anything they say http:// is going to be good.
|
# or not, anything they say http:// is going to be good.
|
||||||
if string.startswith('http://'):
|
if string.startswith('http://'):
|
||||||
|
@ -74,7 +74,14 @@ def like_url(string):
|
||||||
elif string.startswith('www.') and len(string) >= 5:
|
elif string.startswith('www.') and len(string) >= 5:
|
||||||
return True
|
return True
|
||||||
# No dots? Not URLish enough
|
# No dots? Not URLish enough
|
||||||
if string[0] == '.' or string[-1] == '.' or '.' not in string:
|
if string[0] == '.' or string[-1] == '.':
|
||||||
|
return False
|
||||||
|
# This should be a call to "in", but PyPy lacks this function?
|
||||||
|
cdef int i
|
||||||
|
for i in range(len(string)):
|
||||||
|
if string[i] == '.':
|
||||||
|
break
|
||||||
|
else:
|
||||||
return False
|
return False
|
||||||
tld = string.rsplit('.', 1)[1].split(':', 1)[0]
|
tld = string.rsplit('.', 1)[1].split(':', 1)[0]
|
||||||
if tld.endswith('/'):
|
if tld.endswith('/'):
|
||||||
|
@ -90,7 +97,7 @@ NUM_WORDS = set('zero one two three four five six seven eight nine ten'
|
||||||
'eighteen nineteen twenty thirty forty fifty sixty seventy'
|
'eighteen nineteen twenty thirty forty fifty sixty seventy'
|
||||||
'eighty ninety hundred thousand million billion trillion'
|
'eighty ninety hundred thousand million billion trillion'
|
||||||
'quadrillion gajillion bazillion'.split())
|
'quadrillion gajillion bazillion'.split())
|
||||||
def like_number(string):
|
cpdef bint like_number(unicode string):
|
||||||
string = string.replace(',', '')
|
string = string.replace(',', '')
|
||||||
string = string.replace('.', '')
|
string = string.replace('.', '')
|
||||||
if string.isdigit():
|
if string.isdigit():
|
||||||
|
@ -103,30 +110,8 @@ def like_number(string):
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# Statistics features
|
|
||||||
def oft_case(name, thresh):
|
|
||||||
def wrapped(string, prob, case_stats, tag_stats):
|
|
||||||
return string
|
|
||||||
return wrapped
|
|
||||||
|
|
||||||
|
cpdef unicode word_shape(unicode string):
|
||||||
def can_tag(name, thresh=0.5):
|
|
||||||
def wrapped(string, prob, case_stats, tag_stats):
|
|
||||||
return string
|
|
||||||
return wrapped
|
|
||||||
|
|
||||||
|
|
||||||
# String features
|
|
||||||
def canon_case(string, upper_pc=0.0, title_pc=0.0, lower_pc=0.0):
|
|
||||||
if upper_pc >= lower_pc and upper_pc >= title_pc:
|
|
||||||
return string.upper()
|
|
||||||
elif title_pc >= lower_pc:
|
|
||||||
return string.title()
|
|
||||||
else:
|
|
||||||
return string.lower()
|
|
||||||
|
|
||||||
|
|
||||||
def word_shape(string):
|
|
||||||
length = len(string)
|
length = len(string)
|
||||||
shape = []
|
shape = []
|
||||||
last = ""
|
last = ""
|
||||||
|
@ -152,7 +137,7 @@ def word_shape(string):
|
||||||
return ''.join(shape)
|
return ''.join(shape)
|
||||||
|
|
||||||
|
|
||||||
def asciied(string):
|
cpdef bytes asciied(unicode string):
|
||||||
ascii_string = unidecode(string)
|
ascii_string = unidecode(string)
|
||||||
if not ascii_string:
|
if not ascii_string:
|
||||||
return b'???'
|
return b'???'
|
|
@ -1,4 +1,3 @@
|
||||||
# cython: profile=True
|
|
||||||
"""
|
"""
|
||||||
Fill an array, context, with every _atomic_ value our features reference.
|
Fill an array, context, with every _atomic_ value our features reference.
|
||||||
We then write the _actual features_ as tuples of the atoms. The machinery
|
We then write the _actual features_ as tuples of the atoms. The machinery
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
# cython: profile=True
|
|
||||||
from libc.string cimport memmove
|
from libc.string cimport memmove
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
# cython: profile=True
|
|
||||||
from ._state cimport State
|
from ._state cimport State
|
||||||
from ._state cimport has_head, get_idx, get_s0, get_n0
|
from ._state cimport has_head, get_idx, get_s0, get_n0
|
||||||
from ._state cimport is_final, at_eol, pop_stack, push_stack, add_dep
|
from ._state cimport is_final, at_eol, pop_stack, push_stack, add_dep
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
# cython: profile=True
|
|
||||||
"""
|
"""
|
||||||
MALT-style dependency parser
|
MALT-style dependency parser
|
||||||
"""
|
"""
|
||||||
|
@ -54,7 +53,7 @@ def get_templates(name):
|
||||||
if name == 'zhang':
|
if name == 'zhang':
|
||||||
return pf.unigrams, pf.arc_eager
|
return pf.unigrams, pf.arc_eager
|
||||||
else:
|
else:
|
||||||
return pf.hasty, (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s0_n1 + pf.n0_n1 + \
|
return pf.unigrams, (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s0_n1 + pf.n0_n1 + \
|
||||||
pf.tree_shape + pf.trigrams)
|
pf.tree_shape + pf.trigrams)
|
||||||
|
|
||||||
|
|
||||||
|
@ -64,7 +63,8 @@ cdef class GreedyParser:
|
||||||
self.cfg = Config.read(model_dir, 'config')
|
self.cfg = Config.read(model_dir, 'config')
|
||||||
self.moves = TransitionSystem(self.cfg.left_labels, self.cfg.right_labels)
|
self.moves = TransitionSystem(self.cfg.left_labels, self.cfg.right_labels)
|
||||||
hasty_templ, full_templ = get_templates(self.cfg.features)
|
hasty_templ, full_templ = get_templates(self.cfg.features)
|
||||||
self.model = Model(self.moves.n_moves, full_templ, model_dir)
|
self.model = HastyModel(self.moves.n_moves, hasty_templ, full_templ,
|
||||||
|
model_dir)
|
||||||
|
|
||||||
cpdef int parse(self, Tokens tokens) except -1:
|
cpdef int parse(self, Tokens tokens) except -1:
|
||||||
cdef:
|
cdef:
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
# cython: profile=True
|
|
||||||
# cython: embedsignature=True
|
# cython: embedsignature=True
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
from libc.stdint cimport uint32_t
|
from libc.stdint cimport uint32_t
|
||||||
|
|
||||||
import numpy as np
|
from cython.view cimport array as cvarray
|
||||||
cimport numpy as np
|
|
||||||
|
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from thinc.typedefs cimport atom_t
|
from thinc.typedefs cimport atom_t
|
||||||
|
@ -39,7 +38,7 @@ cdef class Tokens:
|
||||||
|
|
||||||
cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1
|
cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1
|
||||||
|
|
||||||
cpdef np.ndarray[long, ndim=2] to_array(self, object features)
|
cpdef long[:,:] to_array(self, object features)
|
||||||
|
|
||||||
|
|
||||||
cdef class Token:
|
cdef class Token:
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
# cython: profile=True
|
|
||||||
# cython: embedsignature=True
|
# cython: embedsignature=True
|
||||||
|
from cython.view cimport array as cvarray
|
||||||
|
|
||||||
from preshed.maps cimport PreshMap
|
from preshed.maps cimport PreshMap
|
||||||
from preshed.counter cimport PreshCounter
|
from preshed.counter cimport PreshCounter
|
||||||
|
@ -12,9 +12,6 @@ from .typedefs cimport POS, LEMMA
|
||||||
|
|
||||||
cimport cython
|
cimport cython
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
cimport numpy as np
|
|
||||||
|
|
||||||
|
|
||||||
DEF PADDING = 5
|
DEF PADDING = 5
|
||||||
|
|
||||||
|
@ -115,7 +112,7 @@ cdef class Tokens:
|
||||||
return idx + t.lex.length
|
return idx + t.lex.length
|
||||||
|
|
||||||
@cython.boundscheck(False)
|
@cython.boundscheck(False)
|
||||||
cpdef np.ndarray[long, ndim=2] to_array(self, object attr_ids):
|
cpdef long[:,:] to_array(self, object attr_ids):
|
||||||
"""Given a list of M attribute IDs, export the tokens to a numpy ndarray
|
"""Given a list of M attribute IDs, export the tokens to a numpy ndarray
|
||||||
of shape N*M, where N is the length of the sentence.
|
of shape N*M, where N is the length of the sentence.
|
||||||
|
|
||||||
|
@ -129,8 +126,8 @@ cdef class Tokens:
|
||||||
"""
|
"""
|
||||||
cdef int i, j
|
cdef int i, j
|
||||||
cdef attr_id_t feature
|
cdef attr_id_t feature
|
||||||
cdef np.ndarray[long, ndim=2] output
|
cdef long[:,:] output = cvarray(shape=(self.length, len(attr_ids)),
|
||||||
output = np.ndarray(shape=(self.length, len(attr_ids)), dtype=int)
|
itemsize=sizeof(long), format="l")
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
for j, feature in enumerate(attr_ids):
|
for j, feature in enumerate(attr_ids):
|
||||||
output[i, j] = get_token_attr(&self.data[i], feature)
|
output[i, j] = get_token_attr(&self.data[i], feature)
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
import os
|
import os
|
||||||
from os import path
|
from os import path
|
||||||
import codecs
|
import codecs
|
||||||
import ujson
|
import json
|
||||||
import re
|
import re
|
||||||
|
|
||||||
DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
|
DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
|
||||||
|
@ -13,7 +13,7 @@ def utf8open(loc, mode='r'):
|
||||||
|
|
||||||
def read_lang_data(data_dir):
|
def read_lang_data(data_dir):
|
||||||
with open(path.join(data_dir, 'specials.json')) as file_:
|
with open(path.join(data_dir, 'specials.json')) as file_:
|
||||||
tokenization = ujson.load(file_)
|
tokenization = json.load(file_)
|
||||||
prefix = read_prefix(data_dir)
|
prefix = read_prefix(data_dir)
|
||||||
suffix = read_suffix(data_dir)
|
suffix = read_suffix(data_dir)
|
||||||
infix = read_infix(data_dir)
|
infix = read_infix(data_dir)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user