* Make PyPy work

This commit is contained in:
Matthew Honnibal 2015-01-05 17:54:13 +11:00
parent 1dd663ea03
commit 3f1944d688
14 changed files with 55 additions and 70 deletions

View File

@@ -1,4 +1,3 @@
# cython: profile=True
from __future__ import unicode_literals
from __future__ import division

View File

@@ -1,20 +1,19 @@
# cython: embedsignature=True
from .. import orth
from ..orth cimport is_alpha, is_ascii, is_digit, is_lower, is_punct, is_space
from ..orth cimport is_title, is_upper, like_url, like_number
from ..typedefs cimport flags_t
def get_flags(unicode string):
cdef flags_t flags = 0
flags |= orth.is_alpha(string) << IS_ALPHA
flags |= orth.is_ascii(string) << IS_ASCII
flags |= orth.is_digit(string) << IS_DIGIT
flags |= orth.is_lower(string) << IS_LOWER
flags |= orth.is_punct(string) << IS_PUNCT
flags |= orth.is_space(string) << IS_SPACE
flags |= orth.is_title(string) << IS_TITLE
flags |= orth.is_upper(string) << IS_UPPER
flags |= orth.like_url(string) << LIKE_URL
flags |= orth.like_number(string) << LIKE_NUM
flags |= is_alpha(string) << IS_ALPHA
flags |= is_ascii(string) << IS_ASCII
flags |= is_digit(string) << IS_DIGIT
flags |= is_lower(string) << IS_LOWER
flags |= is_punct(string) << IS_PUNCT
flags |= is_space(string) << IS_SPACE
flags |= is_title(string) << IS_TITLE
flags |= is_upper(string) << IS_UPPER
flags |= like_url(string) << LIKE_URL
flags |= like_number(string) << LIKE_NUM
return flags

View File

@@ -1,4 +1,3 @@
# cython: profile=True
from os import path
import json
import os

View File

@@ -1,4 +1,3 @@
# cython: profile=True
# cython: embedsignature=True

12
spacy/orth.pxd Normal file
View File

@@ -0,0 +1,12 @@
cpdef bint is_alpha(unicode string)
cpdef bint is_digit(unicode string)
cpdef bint is_punct(unicode string)
cpdef bint is_space(unicode string)
cpdef bint is_ascii(unicode string)
cpdef bint is_title(unicode string)
cpdef bint is_lower(unicode string)
cpdef bint is_upper(unicode string)
cpdef bint like_url(unicode string)
cpdef bint like_number(unicode string)
cpdef unicode word_shape(unicode string)
cpdef bytes asciied(unicode string)

View File

@@ -11,15 +11,15 @@ TAGS = 'adj adp adv conj det noun num pdt pos pron prt punct verb'.upper().split
# Binary string features
def is_alpha(string):
cpdef bint is_alpha(unicode string):
return string.isalpha()
def is_digit(string):
cpdef bint is_digit(unicode string):
return string.isdigit()
def is_punct(string):
cpdef bint is_punct(unicode string):
for c in string:
if not unicodedata.category(c).startswith('P'):
return False
@@ -27,11 +27,11 @@ def is_punct(string):
return True
def is_space(string):
cpdef bint is_space(unicode string):
return string.isspace()
def is_ascii(string):
cpdef bint is_ascii(unicode string):
for c in string:
if ord(c) >= 128:
return False
@@ -39,15 +39,15 @@ def is_ascii(string):
return True
def is_title(string):
cpdef bint is_title(unicode string):
return string.istitle()
def is_lower(string):
cpdef bint is_lower(unicode string):
return string.islower()
def is_upper(string):
cpdef bint is_upper(unicode string):
return string.isupper()
@@ -66,7 +66,7 @@ TLDs = set("com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|mu
"wf|ws|ye|yt|za|zm|zw".split('|'))
def like_url(string):
cpdef bint like_url(unicode string):
# We're looking for things that function in text like URLs. So, valid URL
# or not, anything they say http:// is going to be good.
if string.startswith('http://'):
@@ -74,7 +74,14 @@ def like_url(string):
elif string.startswith('www.') and len(string) >= 5:
return True
# No dots? Not URLish enough
if string[0] == '.' or string[-1] == '.' or '.' not in string:
if string[0] == '.' or string[-1] == '.':
return False
# This should be a call to "in", but PyPy lacks this function?
cdef int i
for i in range(len(string)):
if string[i] == '.':
break
else:
return False
tld = string.rsplit('.', 1)[1].split(':', 1)[0]
if tld.endswith('/'):
@@ -90,7 +97,7 @@ NUM_WORDS = set('zero one two three four five six seven eight nine ten'
'eighteen nineteen twenty thirty forty fifty sixty seventy'
'eighty ninety hundred thousand million billion trillion'
'quadrillion gajillion bazillion'.split())
def like_number(string):
cpdef bint like_number(unicode string):
string = string.replace(',', '')
string = string.replace('.', '')
if string.isdigit():
@@ -103,30 +110,8 @@ def like_number(string):
return True
return False
# Statistics features
def oft_case(name, thresh):
def wrapped(string, prob, case_stats, tag_stats):
return string
return wrapped
def can_tag(name, thresh=0.5):
def wrapped(string, prob, case_stats, tag_stats):
return string
return wrapped
# String features
def canon_case(string, upper_pc=0.0, title_pc=0.0, lower_pc=0.0):
if upper_pc >= lower_pc and upper_pc >= title_pc:
return string.upper()
elif title_pc >= lower_pc:
return string.title()
else:
return string.lower()
def word_shape(string):
cpdef unicode word_shape(unicode string):
length = len(string)
shape = []
last = ""
@@ -152,7 +137,7 @@ def word_shape(string):
return ''.join(shape)
def asciied(string):
cpdef bytes asciied(unicode string):
ascii_string = unidecode(string)
if not ascii_string:
return b'???'

View File

@@ -1,4 +1,3 @@
# cython: profile=True
"""
Fill an array, context, with every _atomic_ value our features reference.
We then write the _actual features_ as tuples of the atoms. The machinery

View File

@@ -1,4 +1,3 @@
# cython: profile=True
from libc.string cimport memmove
from cymem.cymem cimport Pool

View File

@@ -1,4 +1,3 @@
# cython: profile=True
from ._state cimport State
from ._state cimport has_head, get_idx, get_s0, get_n0
from ._state cimport is_final, at_eol, pop_stack, push_stack, add_dep

View File

@@ -1,4 +1,3 @@
# cython: profile=True
"""
MALT-style dependency parser
"""
@@ -54,7 +53,7 @@ def get_templates(name):
if name == 'zhang':
return pf.unigrams, pf.arc_eager
else:
return pf.hasty, (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s0_n1 + pf.n0_n1 + \
return pf.unigrams, (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s0_n1 + pf.n0_n1 + \
pf.tree_shape + pf.trigrams)
@@ -64,7 +63,8 @@ cdef class GreedyParser:
self.cfg = Config.read(model_dir, 'config')
self.moves = TransitionSystem(self.cfg.left_labels, self.cfg.right_labels)
hasty_templ, full_templ = get_templates(self.cfg.features)
self.model = Model(self.moves.n_moves, full_templ, model_dir)
self.model = HastyModel(self.moves.n_moves, hasty_templ, full_templ,
model_dir)
cpdef int parse(self, Tokens tokens) except -1:
cdef:

View File

@@ -1,4 +1,3 @@
# cython: profile=True
# cython: embedsignature=True
from __future__ import unicode_literals

View File

@@ -1,7 +1,6 @@
from libc.stdint cimport uint32_t
import numpy as np
cimport numpy as np
from cython.view cimport array as cvarray
from cymem.cymem cimport Pool
from thinc.typedefs cimport atom_t
@@ -39,7 +38,7 @@ cdef class Tokens:
cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1
cpdef np.ndarray[long, ndim=2] to_array(self, object features)
cpdef long[:,:] to_array(self, object features)
cdef class Token:

View File

@@ -1,5 +1,5 @@
# cython: profile=True
# cython: embedsignature=True
from cython.view cimport array as cvarray
from preshed.maps cimport PreshMap
from preshed.counter cimport PreshCounter
@@ -12,9 +12,6 @@ from .typedefs cimport POS, LEMMA
cimport cython
import numpy as np
cimport numpy as np
DEF PADDING = 5
@@ -115,7 +112,7 @@ cdef class Tokens:
return idx + t.lex.length
@cython.boundscheck(False)
cpdef np.ndarray[long, ndim=2] to_array(self, object attr_ids):
cpdef long[:,:] to_array(self, object attr_ids):
"""Given a list of M attribute IDs, export the tokens to a numpy ndarray
of shape N*M, where N is the length of the sentence.
@@ -129,8 +126,8 @@
"""
cdef int i, j
cdef attr_id_t feature
cdef np.ndarray[long, ndim=2] output
output = np.ndarray(shape=(self.length, len(attr_ids)), dtype=int)
cdef long[:,:] output = cvarray(shape=(self.length, len(attr_ids)),
itemsize=sizeof(long), format="l")
for i in range(self.length):
for j, feature in enumerate(attr_ids):
output[i, j] = get_token_attr(&self.data[i], feature)

View File

@@ -1,7 +1,7 @@
import os
from os import path
import codecs
import ujson
import json
import re
DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
@@ -13,7 +13,7 @@ def utf8open(loc, mode='r'):
def read_lang_data(data_dir):
with open(path.join(data_dir, 'specials.json')) as file_:
tokenization = ujson.load(file_)
tokenization = json.load(file_)
prefix = read_prefix(data_dir)
suffix = read_suffix(data_dir)
infix = read_infix(data_dir)