* Make PyPy work

This commit is contained in:
Matthew Honnibal 2015-01-05 17:54:13 +11:00
parent 1dd663ea03
commit 3f1944d688
14 changed files with 55 additions and 70 deletions

View File

@@ -1,4 +1,3 @@
# cython: profile=True
from __future__ import unicode_literals from __future__ import unicode_literals
from __future__ import division from __future__ import division

View File

@@ -1,20 +1,19 @@
# cython: embedsignature=True # cython: embedsignature=True
from .. import orth from ..orth cimport is_alpha, is_ascii, is_digit, is_lower, is_punct, is_space
from ..orth cimport is_title, is_upper, like_url, like_number
from ..typedefs cimport flags_t from ..typedefs cimport flags_t
def get_flags(unicode string): def get_flags(unicode string):
cdef flags_t flags = 0 cdef flags_t flags = 0
flags |= orth.is_alpha(string) << IS_ALPHA flags |= is_alpha(string) << IS_ALPHA
flags |= orth.is_ascii(string) << IS_ASCII flags |= is_ascii(string) << IS_ASCII
flags |= orth.is_digit(string) << IS_DIGIT flags |= is_digit(string) << IS_DIGIT
flags |= orth.is_lower(string) << IS_LOWER flags |= is_lower(string) << IS_LOWER
flags |= orth.is_punct(string) << IS_PUNCT flags |= is_punct(string) << IS_PUNCT
flags |= orth.is_space(string) << IS_SPACE flags |= is_space(string) << IS_SPACE
flags |= orth.is_title(string) << IS_TITLE flags |= is_title(string) << IS_TITLE
flags |= orth.is_upper(string) << IS_UPPER flags |= is_upper(string) << IS_UPPER
flags |= orth.like_url(string) << LIKE_URL flags |= like_url(string) << LIKE_URL
flags |= orth.like_number(string) << LIKE_NUM flags |= like_number(string) << LIKE_NUM
return flags return flags

View File

@@ -1,4 +1,3 @@
# cython: profile=True
from os import path from os import path
import json import json
import os import os

View File

@@ -1,4 +1,3 @@
# cython: profile=True
# cython: embedsignature=True # cython: embedsignature=True

12
spacy/orth.pxd Normal file
View File

@@ -0,0 +1,12 @@
cpdef bint is_alpha(unicode string)
cpdef bint is_digit(unicode string)
cpdef bint is_punct(unicode string)
cpdef bint is_space(unicode string)
cpdef bint is_ascii(unicode string)
cpdef bint is_title(unicode string)
cpdef bint is_lower(unicode string)
cpdef bint is_upper(unicode string)
cpdef bint like_url(unicode string)
cpdef bint like_number(unicode string)
cpdef unicode word_shape(unicode string)
cpdef bytes asciied(unicode string)

View File

@@ -11,15 +11,15 @@ TAGS = 'adj adp adv conj det noun num pdt pos pron prt punct verb'.upper().split
# Binary string features # Binary string features
def is_alpha(string): cpdef bint is_alpha(unicode string):
return string.isalpha() return string.isalpha()
def is_digit(string): cpdef bint is_digit(unicode string):
return string.isdigit() return string.isdigit()
def is_punct(string): cpdef bint is_punct(unicode string):
for c in string: for c in string:
if not unicodedata.category(c).startswith('P'): if not unicodedata.category(c).startswith('P'):
return False return False
@@ -27,11 +27,11 @@ def is_punct(string):
return True return True
def is_space(string): cpdef bint is_space(unicode string):
return string.isspace() return string.isspace()
def is_ascii(string): cpdef bint is_ascii(unicode string):
for c in string: for c in string:
if ord(c) >= 128: if ord(c) >= 128:
return False return False
@@ -39,15 +39,15 @@ def is_ascii(string):
return True return True
def is_title(string): cpdef bint is_title(unicode string):
return string.istitle() return string.istitle()
def is_lower(string): cpdef bint is_lower(unicode string):
return string.islower() return string.islower()
def is_upper(string): cpdef bint is_upper(unicode string):
return string.isupper() return string.isupper()
@@ -66,7 +66,7 @@ TLDs = set("com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|mu
"wf|ws|ye|yt|za|zm|zw".split('|')) "wf|ws|ye|yt|za|zm|zw".split('|'))
def like_url(string): cpdef bint like_url(unicode string):
# We're looking for things that function in text like URLs. So, valid URL # We're looking for things that function in text like URLs. So, valid URL
# or not, anything they say http:// is going to be good. # or not, anything they say http:// is going to be good.
if string.startswith('http://'): if string.startswith('http://'):
@@ -74,7 +74,14 @@ def like_url(string):
elif string.startswith('www.') and len(string) >= 5: elif string.startswith('www.') and len(string) >= 5:
return True return True
# No dots? Not URLish enough # No dots? Not URLish enough
if string[0] == '.' or string[-1] == '.' or '.' not in string: if string[0] == '.' or string[-1] == '.':
return False
# This should be a call to "in", but PyPy lacks this function?
cdef int i
for i in range(len(string)):
if string[i] == '.':
break
else:
return False return False
tld = string.rsplit('.', 1)[1].split(':', 1)[0] tld = string.rsplit('.', 1)[1].split(':', 1)[0]
if tld.endswith('/'): if tld.endswith('/'):
@@ -90,7 +97,7 @@ NUM_WORDS = set('zero one two three four five six seven eight nine ten'
'eighteen nineteen twenty thirty forty fifty sixty seventy' 'eighteen nineteen twenty thirty forty fifty sixty seventy'
'eighty ninety hundred thousand million billion trillion' 'eighty ninety hundred thousand million billion trillion'
'quadrillion gajillion bazillion'.split()) 'quadrillion gajillion bazillion'.split())
def like_number(string): cpdef bint like_number(unicode string):
string = string.replace(',', '') string = string.replace(',', '')
string = string.replace('.', '') string = string.replace('.', '')
if string.isdigit(): if string.isdigit():
@@ -103,30 +110,8 @@ def like_number(string):
return True return True
return False return False
# Statistics features
def oft_case(name, thresh):
def wrapped(string, prob, case_stats, tag_stats):
return string
return wrapped
cpdef unicode word_shape(unicode string):
def can_tag(name, thresh=0.5):
def wrapped(string, prob, case_stats, tag_stats):
return string
return wrapped
# String features
def canon_case(string, upper_pc=0.0, title_pc=0.0, lower_pc=0.0):
if upper_pc >= lower_pc and upper_pc >= title_pc:
return string.upper()
elif title_pc >= lower_pc:
return string.title()
else:
return string.lower()
def word_shape(string):
length = len(string) length = len(string)
shape = [] shape = []
last = "" last = ""
@@ -152,7 +137,7 @@ def word_shape(string):
return ''.join(shape) return ''.join(shape)
def asciied(string): cpdef bytes asciied(unicode string):
ascii_string = unidecode(string) ascii_string = unidecode(string)
if not ascii_string: if not ascii_string:
return b'???' return b'???'

View File

@@ -1,4 +1,3 @@
# cython: profile=True
""" """
Fill an array, context, with every _atomic_ value our features reference. Fill an array, context, with every _atomic_ value our features reference.
We then write the _actual features_ as tuples of the atoms. The machinery We then write the _actual features_ as tuples of the atoms. The machinery

View File

@@ -1,4 +1,3 @@
# cython: profile=True
from libc.string cimport memmove from libc.string cimport memmove
from cymem.cymem cimport Pool from cymem.cymem cimport Pool

View File

@@ -1,4 +1,3 @@
# cython: profile=True
from ._state cimport State from ._state cimport State
from ._state cimport has_head, get_idx, get_s0, get_n0 from ._state cimport has_head, get_idx, get_s0, get_n0
from ._state cimport is_final, at_eol, pop_stack, push_stack, add_dep from ._state cimport is_final, at_eol, pop_stack, push_stack, add_dep

View File

@@ -1,4 +1,3 @@
# cython: profile=True
""" """
MALT-style dependency parser MALT-style dependency parser
""" """
@@ -54,7 +53,7 @@ def get_templates(name):
if name == 'zhang': if name == 'zhang':
return pf.unigrams, pf.arc_eager return pf.unigrams, pf.arc_eager
else: else:
return pf.hasty, (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s0_n1 + pf.n0_n1 + \ return pf.unigrams, (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s0_n1 + pf.n0_n1 + \
pf.tree_shape + pf.trigrams) pf.tree_shape + pf.trigrams)
@@ -64,7 +63,8 @@ cdef class GreedyParser:
self.cfg = Config.read(model_dir, 'config') self.cfg = Config.read(model_dir, 'config')
self.moves = TransitionSystem(self.cfg.left_labels, self.cfg.right_labels) self.moves = TransitionSystem(self.cfg.left_labels, self.cfg.right_labels)
hasty_templ, full_templ = get_templates(self.cfg.features) hasty_templ, full_templ = get_templates(self.cfg.features)
self.model = Model(self.moves.n_moves, full_templ, model_dir) self.model = HastyModel(self.moves.n_moves, hasty_templ, full_templ,
model_dir)
cpdef int parse(self, Tokens tokens) except -1: cpdef int parse(self, Tokens tokens) except -1:
cdef: cdef:

View File

@@ -1,4 +1,3 @@
# cython: profile=True
# cython: embedsignature=True # cython: embedsignature=True
from __future__ import unicode_literals from __future__ import unicode_literals

View File

@@ -1,7 +1,6 @@
from libc.stdint cimport uint32_t from libc.stdint cimport uint32_t
import numpy as np from cython.view cimport array as cvarray
cimport numpy as np
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from thinc.typedefs cimport atom_t from thinc.typedefs cimport atom_t
@@ -39,7 +38,7 @@ cdef class Tokens:
cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1 cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1
cpdef np.ndarray[long, ndim=2] to_array(self, object features) cpdef long[:,:] to_array(self, object features)
cdef class Token: cdef class Token:

View File

@@ -1,5 +1,5 @@
# cython: profile=True
# cython: embedsignature=True # cython: embedsignature=True
from cython.view cimport array as cvarray
from preshed.maps cimport PreshMap from preshed.maps cimport PreshMap
from preshed.counter cimport PreshCounter from preshed.counter cimport PreshCounter
@@ -12,9 +12,6 @@ from .typedefs cimport POS, LEMMA
cimport cython cimport cython
import numpy as np
cimport numpy as np
DEF PADDING = 5 DEF PADDING = 5
@@ -115,7 +112,7 @@ cdef class Tokens:
return idx + t.lex.length return idx + t.lex.length
@cython.boundscheck(False) @cython.boundscheck(False)
cpdef np.ndarray[long, ndim=2] to_array(self, object attr_ids): cpdef long[:,:] to_array(self, object attr_ids):
"""Given a list of M attribute IDs, export the tokens to a numpy ndarray """Given a list of M attribute IDs, export the tokens to a numpy ndarray
of shape N*M, where N is the length of the sentence. of shape N*M, where N is the length of the sentence.
@@ -129,8 +126,8 @@
""" """
cdef int i, j cdef int i, j
cdef attr_id_t feature cdef attr_id_t feature
cdef np.ndarray[long, ndim=2] output cdef long[:,:] output = cvarray(shape=(self.length, len(attr_ids)),
output = np.ndarray(shape=(self.length, len(attr_ids)), dtype=int) itemsize=sizeof(long), format="l")
for i in range(self.length): for i in range(self.length):
for j, feature in enumerate(attr_ids): for j, feature in enumerate(attr_ids):
output[i, j] = get_token_attr(&self.data[i], feature) output[i, j] = get_token_attr(&self.data[i], feature)

View File

@@ -1,7 +1,7 @@
import os import os
from os import path from os import path
import codecs import codecs
import ujson import json
import re import re
DATA_DIR = path.join(path.dirname(__file__), '..', 'data') DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
@@ -13,7 +13,7 @@ def utf8open(loc, mode='r'):
def read_lang_data(data_dir): def read_lang_data(data_dir):
with open(path.join(data_dir, 'specials.json')) as file_: with open(path.join(data_dir, 'specials.json')) as file_:
tokenization = ujson.load(file_) tokenization = json.load(file_)
prefix = read_prefix(data_dir) prefix = read_prefix(data_dir)
suffix = read_suffix(data_dir) suffix = read_suffix(data_dir)
infix = read_infix(data_dir) infix = read_infix(data_dir)