Mirror of https://github.com/explosion/spaCy.git
commit 3f1944d688 (parent 1dd663ea03)

    Make PyPy work
@@ -1,4 +1,3 @@
-# cython: profile=True
 from __future__ import unicode_literals
 from __future__ import division
 
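A note on the "# cython: profile=True" deletions here and in several hunks
below: the directive makes Cython emit hooks for CPython's cProfile
machinery, which PyPy did not support, so making PyPy work starts with
stripping it from every module. For reference, a sketch of how the
directive is normally scoped (the function is illustrative only):

    # cython: profile=True        # module-wide: emit cProfile hooks everywhere
    cimport cython

    @cython.profile(False)        # opt a hot inner function back out
    cdef int dot(int* xs, int* ys, int n):
        cdef int i, total = 0
        for i in range(n):
            total += xs[i] * ys[i]
        return total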
@@ -1,20 +1,19 @@
 # cython: embedsignature=True
-from .. import orth
+from ..orth cimport is_alpha, is_ascii, is_digit, is_lower, is_punct, is_space
+from ..orth cimport is_title, is_upper, like_url, like_number
 from ..typedefs cimport flags_t
 
 
 def get_flags(unicode string):
     cdef flags_t flags = 0
-    flags |= orth.is_alpha(string) << IS_ALPHA
-    flags |= orth.is_ascii(string) << IS_ASCII
-    flags |= orth.is_digit(string) << IS_DIGIT
-    flags |= orth.is_lower(string) << IS_LOWER
-    flags |= orth.is_punct(string) << IS_PUNCT
-    flags |= orth.is_space(string) << IS_SPACE
-    flags |= orth.is_title(string) << IS_TITLE
-    flags |= orth.is_upper(string) << IS_UPPER
-    flags |= orth.like_url(string) << LIKE_URL
-    flags |= orth.like_number(string) << LIKE_NUM
+    flags |= is_alpha(string) << IS_ALPHA
+    flags |= is_ascii(string) << IS_ASCII
+    flags |= is_digit(string) << IS_DIGIT
+    flags |= is_lower(string) << IS_LOWER
+    flags |= is_punct(string) << IS_PUNCT
+    flags |= is_space(string) << IS_SPACE
+    flags |= is_title(string) << IS_TITLE
+    flags |= is_upper(string) << IS_UPPER
+    flags |= like_url(string) << LIKE_URL
+    flags |= like_number(string) << LIKE_NUM
     return flags
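The get_flags rewrite keeps the same bit layout: each orthographic
predicate contributes one bit of a flags_t integer, so testing a property
later is a single mask operation. A minimal Python sketch of the round
trip (the bit positions are hypothetical; the real values come from
spaCy's attribute constants):

    IS_ALPHA, IS_DIGIT, IS_LOWER = 1, 2, 3   # illustrative bit positions

    def get_flags(string):
        flags = 0
        flags |= string.isalpha() << IS_ALPHA   # a bool shifts as 0 or 1
        flags |= string.isdigit() << IS_DIGIT
        flags |= string.islower() << IS_LOWER
        return flags

    def check_flag(flags, flag_id):
        return bool(flags & (1 << flag_id))

    assert check_flag(get_flags(u"spam"), IS_ALPHA)
    assert not check_flag(get_flags(u"spam"), IS_DIGIT)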
@@ -1,4 +1,3 @@
-# cython: profile=True
 from os import path
 import json
 import os
@@ -1,4 +1,3 @@
-# cython: profile=True
 # cython: embedsignature=True
 
 
spacy/orth.pxd (new file, 12 lines)
@@ -0,0 +1,12 @@
+cpdef bint is_alpha(unicode string)
+cpdef bint is_digit(unicode string)
+cpdef bint is_punct(unicode string)
+cpdef bint is_space(unicode string)
+cpdef bint is_ascii(unicode string)
+cpdef bint is_title(unicode string)
+cpdef bint is_lower(unicode string)
+cpdef bint is_upper(unicode string)
+cpdef bint like_url(unicode string)
+cpdef bint like_number(unicode string)
+cpdef unicode word_shape(unicode string)
+cpdef bytes asciied(unicode string)
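Declaring these functions cpdef in a new .pxd is what makes the import
changes above work: Python callers still see ordinary functions, while
Cython modules that cimport them call straight into the C implementations
with no Python call overhead. Roughly (module paths as in the diff):

    # From Python -- dynamic dispatch through the module namespace:
    from spacy import orth
    orth.is_alpha(u"spam")

    # From a Cython .pyx module -- bound to the C entry point at compile time:
    #     from spacy.orth cimport is_alpha
    #     cdef bint flag = is_alpha(u"spam")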
@@ -11,15 +11,15 @@ TAGS = 'adj adp adv conj det noun num pdt pos pron prt punct verb'.upper().split
 
 
 # Binary string features
-def is_alpha(string):
+cpdef bint is_alpha(unicode string):
     return string.isalpha()
 
 
-def is_digit(string):
+cpdef bint is_digit(unicode string):
     return string.isdigit()
 
 
-def is_punct(string):
+cpdef bint is_punct(unicode string):
     for c in string:
         if not unicodedata.category(c).startswith('P'):
             return False
@@ -27,11 +27,11 @@ def is_punct(string):
     return True
 
 
-def is_space(string):
+cpdef bint is_space(unicode string):
     return string.isspace()
 
 
-def is_ascii(string):
+cpdef bint is_ascii(unicode string):
     for c in string:
         if ord(c) >= 128:
             return False
@@ -39,15 +39,15 @@ def is_ascii(string):
     return True
 
 
-def is_title(string):
+cpdef bint is_title(unicode string):
     return string.istitle()
 
 
-def is_lower(string):
+cpdef bint is_lower(unicode string):
     return string.islower()
 
 
-def is_upper(string):
+cpdef bint is_upper(unicode string):
     return string.isupper()
 
 
@@ -66,7 +66,7 @@ TLDs = set("com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|mu
            "wf|ws|ye|yt|za|zm|zw".split('|'))
 
 
-def like_url(string):
+cpdef bint like_url(unicode string):
     # We're looking for things that function in text like URLs. So, valid URL
     # or not, anything they say http:// is going to be good.
     if string.startswith('http://'):
@@ -74,7 +74,14 @@ def like_url(string):
     elif string.startswith('www.') and len(string) >= 5:
         return True
     # No dots? Not URLish enough
-    if string[0] == '.' or string[-1] == '.' or '.' not in string:
+    if string[0] == '.' or string[-1] == '.':
         return False
+    # This should be a call to "in", but PyPy lacks this function?
+    cdef int i
+    for i in range(len(string)):
+        if string[i] == '.':
+            break
+    else:
+        return False
     tld = string.rsplit('.', 1)[1].split(':', 1)[0]
     if tld.endswith('/'):
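The workaround replaces the "'.' not in string" membership test with an
explicit scan built on Python's for/else: the else arm runs only when the
loop finishes without hitting break, i.e. when no dot was found. The same
idiom in plain Python:

    def contains_dot(string):
        # Equivalent to "'.' in string", avoiding unicode.__contains__,
        # which the commit comment says the targeted PyPy lacked.
        for i in range(len(string)):
            if string[i] == '.':
                break          # found one; the else arm is skipped
        else:
            return False       # loop ran to completion: no dot anywhere
        return True

    assert contains_dot(u"www.example.com")
    assert not contains_dot(u"nodots")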
@@ -90,7 +97,7 @@ NUM_WORDS = set('zero one two three four five six seven eight nine ten'
                 'eighteen nineteen twenty thirty forty fifty sixty seventy'
                 'eighty ninety hundred thousand million billion trillion'
                 'quadrillion gajillion bazillion'.split())
-def like_number(string):
+cpdef bint like_number(unicode string):
     string = string.replace(',', '')
     string = string.replace('.', '')
     if string.isdigit():
@@ -103,30 +110,8 @@ def like_number(string):
         return True
     return False
 
-# Statistics features
-def oft_case(name, thresh):
-    def wrapped(string, prob, case_stats, tag_stats):
-        return string
-    return wrapped
-
-
-def can_tag(name, thresh=0.5):
-    def wrapped(string, prob, case_stats, tag_stats):
-        return string
-    return wrapped
-
-
-# String features
-def canon_case(string, upper_pc=0.0, title_pc=0.0, lower_pc=0.0):
-    if upper_pc >= lower_pc and upper_pc >= title_pc:
-        return string.upper()
-    elif title_pc >= lower_pc:
-        return string.title()
-    else:
-        return string.lower()
-
-
-def word_shape(string):
+cpdef unicode word_shape(unicode string):
     length = len(string)
     shape = []
     last = ""
|
@ -1,4 +1,3 @@
|
|||
# cython: profile=True
|
||||
"""
|
||||
Fill an array, context, with every _atomic_ value our features reference.
|
||||
We then write the _actual features_ as tuples of the atoms. The machinery
|
||||
|
|
|
@@ -1,4 +1,3 @@
-# cython: profile=True
 from libc.string cimport memmove
 from cymem.cymem cimport Pool
 
@@ -1,4 +1,3 @@
-# cython: profile=True
 from ._state cimport State
 from ._state cimport has_head, get_idx, get_s0, get_n0
 from ._state cimport is_final, at_eol, pop_stack, push_stack, add_dep
@@ -1,4 +1,3 @@
-# cython: profile=True
 """
 MALT-style dependency parser
 """
@@ -54,7 +53,7 @@ def get_templates(name):
     if name == 'zhang':
         return pf.unigrams, pf.arc_eager
     else:
-        return pf.hasty, (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s0_n1 + pf.n0_n1 + \
+        return pf.unigrams, (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s0_n1 + pf.n0_n1 + \
                           pf.tree_shape + pf.trigrams)
 
 
@@ -64,7 +63,8 @@ cdef class GreedyParser:
         self.cfg = Config.read(model_dir, 'config')
         self.moves = TransitionSystem(self.cfg.left_labels, self.cfg.right_labels)
         hasty_templ, full_templ = get_templates(self.cfg.features)
-        self.model = Model(self.moves.n_moves, full_templ, model_dir)
+        self.model = HastyModel(self.moves.n_moves, hasty_templ, full_templ,
+                                model_dir)
 
     cpdef int parse(self, Tokens tokens) except -1:
         cdef:
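The swap from Model to HastyModel wires both template sets into the
parser. A plausible reading of the design (a sketch of the idea only, not
thinc's actual implementation): score each state with the cheap unigram
templates first, and consult the expensive full-template model only when
the fast scores are inconclusive.

    class TwoStageScorer(object):
        """Hypothetical two-stage scorer illustrating a hasty/full split."""
        def __init__(self, fast, full, margin=1.0):
            self.fast, self.full, self.margin = fast, full, margin

        def score(self, feats):
            scores = self.fast.score(feats)    # cheap feature set
            ranked = sorted(scores, reverse=True)
            if ranked[0] - ranked[1] >= self.margin:
                return scores                  # fast path was decisive
            return self.full.score(feats)      # fall back to full templates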
@@ -1,4 +1,3 @@
-# cython: profile=True
 # cython: embedsignature=True
 from __future__ import unicode_literals
 
@@ -1,7 +1,6 @@
 from libc.stdint cimport uint32_t
 
-import numpy as np
-cimport numpy as np
+from cython.view cimport array as cvarray
 
 from cymem.cymem cimport Pool
 from thinc.typedefs cimport atom_t
@@ -39,7 +38,7 @@ cdef class Tokens:
 
     cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1
 
-    cpdef np.ndarray[long, ndim=2] to_array(self, object features)
+    cpdef long[:,:] to_array(self, object features)
 
 
 cdef class Token:
|
@ -1,5 +1,5 @@
|
|||
# cython: profile=True
|
||||
# cython: embedsignature=True
|
||||
from cython.view cimport array as cvarray
|
||||
|
||||
from preshed.maps cimport PreshMap
|
||||
from preshed.counter cimport PreshCounter
|
||||
|
@@ -12,9 +12,6 @@ from .typedefs cimport POS, LEMMA
 
 cimport cython
 
-import numpy as np
-cimport numpy as np
-
 
 DEF PADDING = 5
 
@@ -115,7 +112,7 @@ cdef class Tokens:
         return idx + t.lex.length
 
     @cython.boundscheck(False)
-    cpdef np.ndarray[long, ndim=2] to_array(self, object attr_ids):
+    cpdef long[:,:] to_array(self, object attr_ids):
         """Given a list of M attribute IDs, export the tokens to a numpy ndarray
         of shape N*M, where N is the length of the sentence.
 
@@ -129,8 +126,8 @@ cdef class Tokens:
         """
         cdef int i, j
         cdef attr_id_t feature
-        cdef np.ndarray[long, ndim=2] output
-        output = np.ndarray(shape=(self.length, len(attr_ids)), dtype=int)
+        cdef long[:,:] output = cvarray(shape=(self.length, len(attr_ids)),
+                                        itemsize=sizeof(long), format="l")
         for i in range(self.length):
            for j, feature in enumerate(attr_ids):
                output[i, j] = get_token_attr(&self.data[i], feature)
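Returning a long[:,:] typed memoryview instead of an np.ndarray removes
to_array's dependence on NumPy's C-API (the cimport numpy lines deleted
above), which was the sticking point under PyPy; cython.view.array now
owns the buffer. A caller that wants an ndarray can still wrap the result
without copying, assuming NumPy is importable on its interpreter:

    import numpy as np

    view = tokens.to_array(attr_ids)   # long[:, :] memoryview
    arr = np.asarray(view)             # wraps the same buffer, no copy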
@@ -1,7 +1,7 @@
 import os
 from os import path
 import codecs
-import ujson
+import json
 import re
 
 DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
@@ -13,7 +13,7 @@ def utf8open(loc, mode='r'):
 
 def read_lang_data(data_dir):
     with open(path.join(data_dir, 'specials.json')) as file_:
-        tokenization = ujson.load(file_)
+        tokenization = json.load(file_)
     prefix = read_prefix(data_dir)
     suffix = read_suffix(data_dir)
     infix = read_infix(data_dir)
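ujson is a C extension, so moving read_lang_data onto the stdlib json
module keeps it importable on PyPy; both expose the same load()/loads()
surface for this call. A common guard, shown only as an alternative the
commit did not take:

    try:
        import ujson as json   # fast C parser where available (CPython)
    except ImportError:
        import json            # pure-stdlib fallback; also works on PyPy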