diff --git a/spacy/__init__.py b/spacy/__init__.py new file mode 100644 index 000000000..7ab2b411d --- /dev/null +++ b/spacy/__init__.py @@ -0,0 +1,28 @@ +from .lexeme import lex_of +from .lexeme import sic_of + + +__all__ = [lex_of, sic_of] + + +""" +from .tokens import ids_from_string +from .tokens import group_by + +from .lex import sic_of +from .lex import lex_of +from .lex import normed_of +from .lex import first_of +from .lex import last_three_of + +from .lex import cluster_of +from .lex import prob_of + +from .lex import is_oft_upper +from .lex import is_oft_title + +from .lex import can_noun +from .lex import can_verb +from .lex import can_adj +from .lex import can_adv +""" diff --git a/spacy/en.cpp b/spacy/en.cpp new file mode 100644 index 000000000..c5d249e9d --- /dev/null +++ b/spacy/en.cpp @@ -0,0 +1,4529 @@ +/* Generated by Cython 0.20.1 on Sat Jul 5 20:44:26 2014 */ + +#define PY_SSIZE_T_CLEAN +#ifndef CYTHON_USE_PYLONG_INTERNALS +#ifdef PYLONG_BITS_IN_DIGIT +#define CYTHON_USE_PYLONG_INTERNALS 0 +#else +#include "pyconfig.h" +#ifdef PYLONG_BITS_IN_DIGIT +#define CYTHON_USE_PYLONG_INTERNALS 1 +#else +#define CYTHON_USE_PYLONG_INTERNALS 0 +#endif +#endif +#endif +#include "Python.h" +#ifndef Py_PYTHON_H + #error Python headers needed to compile C extensions, please install development version of Python. +#elif PY_VERSION_HEX < 0x02040000 + #error Cython requires Python 2.4+. +#else +#define CYTHON_ABI "0_20_1" +#include /* For offsetof */ +#ifndef offsetof +#define offsetof(type, member) ( (size_t) & ((type*)0) -> member ) +#endif +#if !defined(WIN32) && !defined(MS_WINDOWS) + #ifndef __stdcall + #define __stdcall + #endif + #ifndef __cdecl + #define __cdecl + #endif + #ifndef __fastcall + #define __fastcall + #endif +#endif +#ifndef DL_IMPORT + #define DL_IMPORT(t) t +#endif +#ifndef DL_EXPORT + #define DL_EXPORT(t) t +#endif +#ifndef PY_LONG_LONG + #define PY_LONG_LONG LONG_LONG +#endif +#ifndef Py_HUGE_VAL + #define Py_HUGE_VAL HUGE_VAL +#endif +#ifdef PYPY_VERSION +#define CYTHON_COMPILING_IN_PYPY 1 +#define CYTHON_COMPILING_IN_CPYTHON 0 +#else +#define CYTHON_COMPILING_IN_PYPY 0 +#define CYTHON_COMPILING_IN_CPYTHON 1 +#endif +#if CYTHON_COMPILING_IN_PYPY +#define Py_OptimizeFlag 0 +#endif +#if PY_VERSION_HEX < 0x02050000 + typedef int Py_ssize_t; + #define PY_SSIZE_T_MAX INT_MAX + #define PY_SSIZE_T_MIN INT_MIN + #define PY_FORMAT_SIZE_T "" + #define CYTHON_FORMAT_SSIZE_T "" + #define PyInt_FromSsize_t(z) PyInt_FromLong(z) + #define PyInt_AsSsize_t(o) __Pyx_PyInt_As_int(o) + #define PyNumber_Index(o) ((PyNumber_Check(o) && !PyFloat_Check(o)) ? PyNumber_Int(o) : \ + (PyErr_Format(PyExc_TypeError, \ + "expected index value, got %.200s", Py_TYPE(o)->tp_name), \ + (PyObject*)0)) + #define __Pyx_PyIndex_Check(o) (PyNumber_Check(o) && !PyFloat_Check(o) && \ + !PyComplex_Check(o)) + #define PyIndex_Check __Pyx_PyIndex_Check + #define PyErr_WarnEx(category, message, stacklevel) PyErr_Warn(category, message) + #define __PYX_BUILD_PY_SSIZE_T "i" +#else + #define __PYX_BUILD_PY_SSIZE_T "n" + #define CYTHON_FORMAT_SSIZE_T "z" + #define __Pyx_PyIndex_Check PyIndex_Check +#endif +#if PY_VERSION_HEX < 0x02060000 + #define Py_REFCNT(ob) (((PyObject*)(ob))->ob_refcnt) + #define Py_TYPE(ob) (((PyObject*)(ob))->ob_type) + #define Py_SIZE(ob) (((PyVarObject*)(ob))->ob_size) + #define PyVarObject_HEAD_INIT(type, size) \ + PyObject_HEAD_INIT(type) size, + #define PyType_Modified(t) + typedef struct { + void *buf; + PyObject *obj; + Py_ssize_t len; + Py_ssize_t itemsize; + int readonly; + int ndim; + char *format; + Py_ssize_t *shape; + Py_ssize_t *strides; + Py_ssize_t *suboffsets; + void *internal; + } Py_buffer; + #define PyBUF_SIMPLE 0 + #define PyBUF_WRITABLE 0x0001 + #define PyBUF_FORMAT 0x0004 + #define PyBUF_ND 0x0008 + #define PyBUF_STRIDES (0x0010 | PyBUF_ND) + #define PyBUF_C_CONTIGUOUS (0x0020 | PyBUF_STRIDES) + #define PyBUF_F_CONTIGUOUS (0x0040 | PyBUF_STRIDES) + #define PyBUF_ANY_CONTIGUOUS (0x0080 | PyBUF_STRIDES) + #define PyBUF_INDIRECT (0x0100 | PyBUF_STRIDES) + #define PyBUF_RECORDS (PyBUF_STRIDES | PyBUF_FORMAT | PyBUF_WRITABLE) + #define PyBUF_FULL (PyBUF_INDIRECT | PyBUF_FORMAT | PyBUF_WRITABLE) + typedef int (*getbufferproc)(PyObject *, Py_buffer *, int); + typedef void (*releasebufferproc)(PyObject *, Py_buffer *); +#endif +#if PY_MAJOR_VERSION < 3 + #define __Pyx_BUILTIN_MODULE_NAME "__builtin__" + #define __Pyx_PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos) \ + PyCode_New(a+k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos) + #define __Pyx_DefaultClassType PyClass_Type +#else + #define __Pyx_BUILTIN_MODULE_NAME "builtins" + #define __Pyx_PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos) \ + PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos) + #define __Pyx_DefaultClassType PyType_Type +#endif +#if PY_VERSION_HEX < 0x02060000 + #define PyUnicode_FromString(s) PyUnicode_Decode(s, strlen(s), "UTF-8", "strict") +#endif +#if PY_MAJOR_VERSION >= 3 + #define Py_TPFLAGS_CHECKTYPES 0 + #define Py_TPFLAGS_HAVE_INDEX 0 +#endif +#if (PY_VERSION_HEX < 0x02060000) || (PY_MAJOR_VERSION >= 3) + #define Py_TPFLAGS_HAVE_NEWBUFFER 0 +#endif +#if PY_VERSION_HEX < 0x02060000 + #define Py_TPFLAGS_HAVE_VERSION_TAG 0 +#endif +#if PY_VERSION_HEX < 0x02060000 && !defined(Py_TPFLAGS_IS_ABSTRACT) + #define Py_TPFLAGS_IS_ABSTRACT 0 +#endif +#if PY_VERSION_HEX < 0x030400a1 && !defined(Py_TPFLAGS_HAVE_FINALIZE) + #define Py_TPFLAGS_HAVE_FINALIZE 0 +#endif +#if PY_VERSION_HEX > 0x03030000 && defined(PyUnicode_KIND) + #define CYTHON_PEP393_ENABLED 1 + #define __Pyx_PyUnicode_READY(op) (likely(PyUnicode_IS_READY(op)) ? \ + 0 : _PyUnicode_Ready((PyObject *)(op))) + #define __Pyx_PyUnicode_GET_LENGTH(u) PyUnicode_GET_LENGTH(u) + #define __Pyx_PyUnicode_READ_CHAR(u, i) PyUnicode_READ_CHAR(u, i) + #define __Pyx_PyUnicode_KIND(u) PyUnicode_KIND(u) + #define __Pyx_PyUnicode_DATA(u) PyUnicode_DATA(u) + #define __Pyx_PyUnicode_READ(k, d, i) PyUnicode_READ(k, d, i) +#else + #define CYTHON_PEP393_ENABLED 0 + #define __Pyx_PyUnicode_READY(op) (0) + #define __Pyx_PyUnicode_GET_LENGTH(u) PyUnicode_GET_SIZE(u) + #define __Pyx_PyUnicode_READ_CHAR(u, i) ((Py_UCS4)(PyUnicode_AS_UNICODE(u)[i])) + #define __Pyx_PyUnicode_KIND(u) (sizeof(Py_UNICODE)) + #define __Pyx_PyUnicode_DATA(u) ((void*)PyUnicode_AS_UNICODE(u)) + #define __Pyx_PyUnicode_READ(k, d, i) ((void)(k), (Py_UCS4)(((Py_UNICODE*)d)[i])) +#endif +#if CYTHON_COMPILING_IN_PYPY + #define __Pyx_PyUnicode_Concat(a, b) PyNumber_Add(a, b) + #define __Pyx_PyUnicode_ConcatSafe(a, b) PyNumber_Add(a, b) +#else + #define __Pyx_PyUnicode_Concat(a, b) PyUnicode_Concat(a, b) + #define __Pyx_PyUnicode_ConcatSafe(a, b) ((unlikely((a) == Py_None) || unlikely((b) == Py_None)) ? \ + PyNumber_Add(a, b) : __Pyx_PyUnicode_Concat(a, b)) +#endif +#define __Pyx_PyString_FormatSafe(a, b) ((unlikely((a) == Py_None)) ? PyNumber_Remainder(a, b) : __Pyx_PyString_Format(a, b)) +#define __Pyx_PyUnicode_FormatSafe(a, b) ((unlikely((a) == Py_None)) ? PyNumber_Remainder(a, b) : PyUnicode_Format(a, b)) +#if PY_MAJOR_VERSION >= 3 + #define __Pyx_PyString_Format(a, b) PyUnicode_Format(a, b) +#else + #define __Pyx_PyString_Format(a, b) PyString_Format(a, b) +#endif +#if PY_MAJOR_VERSION >= 3 + #define PyBaseString_Type PyUnicode_Type + #define PyStringObject PyUnicodeObject + #define PyString_Type PyUnicode_Type + #define PyString_Check PyUnicode_Check + #define PyString_CheckExact PyUnicode_CheckExact +#endif +#if PY_VERSION_HEX < 0x02060000 + #define PyBytesObject PyStringObject + #define PyBytes_Type PyString_Type + #define PyBytes_Check PyString_Check + #define PyBytes_CheckExact PyString_CheckExact + #define PyBytes_FromString PyString_FromString + #define PyBytes_FromStringAndSize PyString_FromStringAndSize + #define PyBytes_FromFormat PyString_FromFormat + #define PyBytes_DecodeEscape PyString_DecodeEscape + #define PyBytes_AsString PyString_AsString + #define PyBytes_AsStringAndSize PyString_AsStringAndSize + #define PyBytes_Size PyString_Size + #define PyBytes_AS_STRING PyString_AS_STRING + #define PyBytes_GET_SIZE PyString_GET_SIZE + #define PyBytes_Repr PyString_Repr + #define PyBytes_Concat PyString_Concat + #define PyBytes_ConcatAndDel PyString_ConcatAndDel +#endif +#if PY_MAJOR_VERSION >= 3 + #define __Pyx_PyBaseString_Check(obj) PyUnicode_Check(obj) + #define __Pyx_PyBaseString_CheckExact(obj) PyUnicode_CheckExact(obj) +#else + #define __Pyx_PyBaseString_Check(obj) (PyString_CheckExact(obj) || PyUnicode_CheckExact(obj) || \ + PyString_Check(obj) || PyUnicode_Check(obj)) + #define __Pyx_PyBaseString_CheckExact(obj) (PyString_CheckExact(obj) || PyUnicode_CheckExact(obj)) +#endif +#if PY_VERSION_HEX < 0x02060000 + #define PySet_Check(obj) PyObject_TypeCheck(obj, &PySet_Type) + #define PyFrozenSet_Check(obj) PyObject_TypeCheck(obj, &PyFrozenSet_Type) +#endif +#ifndef PySet_CheckExact + #define PySet_CheckExact(obj) (Py_TYPE(obj) == &PySet_Type) +#endif +#define __Pyx_TypeCheck(obj, type) PyObject_TypeCheck(obj, (PyTypeObject *)type) +#if PY_MAJOR_VERSION >= 3 + #define PyIntObject PyLongObject + #define PyInt_Type PyLong_Type + #define PyInt_Check(op) PyLong_Check(op) + #define PyInt_CheckExact(op) PyLong_CheckExact(op) + #define PyInt_FromString PyLong_FromString + #define PyInt_FromUnicode PyLong_FromUnicode + #define PyInt_FromLong PyLong_FromLong + #define PyInt_FromSize_t PyLong_FromSize_t + #define PyInt_FromSsize_t PyLong_FromSsize_t + #define PyInt_AsLong PyLong_AsLong + #define PyInt_AS_LONG PyLong_AS_LONG + #define PyInt_AsSsize_t PyLong_AsSsize_t + #define PyInt_AsUnsignedLongMask PyLong_AsUnsignedLongMask + #define PyInt_AsUnsignedLongLongMask PyLong_AsUnsignedLongLongMask + #define PyNumber_Int PyNumber_Long +#endif +#if PY_MAJOR_VERSION >= 3 + #define PyBoolObject PyLongObject +#endif +#if PY_VERSION_HEX < 0x030200A4 + typedef long Py_hash_t; + #define __Pyx_PyInt_FromHash_t PyInt_FromLong + #define __Pyx_PyInt_AsHash_t PyInt_AsLong +#else + #define __Pyx_PyInt_FromHash_t PyInt_FromSsize_t + #define __Pyx_PyInt_AsHash_t PyInt_AsSsize_t +#endif +#if (PY_MAJOR_VERSION < 3) || (PY_VERSION_HEX >= 0x03010300) + #define __Pyx_PySequence_GetSlice(obj, a, b) PySequence_GetSlice(obj, a, b) + #define __Pyx_PySequence_SetSlice(obj, a, b, value) PySequence_SetSlice(obj, a, b, value) + #define __Pyx_PySequence_DelSlice(obj, a, b) PySequence_DelSlice(obj, a, b) +#else + #define __Pyx_PySequence_GetSlice(obj, a, b) (unlikely(!(obj)) ? \ + (PyErr_SetString(PyExc_SystemError, "null argument to internal routine"), (PyObject*)0) : \ + (likely((obj)->ob_type->tp_as_mapping) ? (PySequence_GetSlice(obj, a, b)) : \ + (PyErr_Format(PyExc_TypeError, "'%.200s' object is unsliceable", (obj)->ob_type->tp_name), (PyObject*)0))) + #define __Pyx_PySequence_SetSlice(obj, a, b, value) (unlikely(!(obj)) ? \ + (PyErr_SetString(PyExc_SystemError, "null argument to internal routine"), -1) : \ + (likely((obj)->ob_type->tp_as_mapping) ? (PySequence_SetSlice(obj, a, b, value)) : \ + (PyErr_Format(PyExc_TypeError, "'%.200s' object doesn't support slice assignment", (obj)->ob_type->tp_name), -1))) + #define __Pyx_PySequence_DelSlice(obj, a, b) (unlikely(!(obj)) ? \ + (PyErr_SetString(PyExc_SystemError, "null argument to internal routine"), -1) : \ + (likely((obj)->ob_type->tp_as_mapping) ? (PySequence_DelSlice(obj, a, b)) : \ + (PyErr_Format(PyExc_TypeError, "'%.200s' object doesn't support slice deletion", (obj)->ob_type->tp_name), -1))) +#endif +#if PY_MAJOR_VERSION >= 3 + #define PyMethod_New(func, self, klass) ((self) ? PyMethod_New(func, self) : PyInstanceMethod_New(func)) +#endif +#if PY_VERSION_HEX < 0x02050000 + #define __Pyx_GetAttrString(o,n) PyObject_GetAttrString((o),((char *)(n))) + #define __Pyx_SetAttrString(o,n,a) PyObject_SetAttrString((o),((char *)(n)),(a)) + #define __Pyx_DelAttrString(o,n) PyObject_DelAttrString((o),((char *)(n))) +#else + #define __Pyx_GetAttrString(o,n) PyObject_GetAttrString((o),(n)) + #define __Pyx_SetAttrString(o,n,a) PyObject_SetAttrString((o),(n),(a)) + #define __Pyx_DelAttrString(o,n) PyObject_DelAttrString((o),(n)) +#endif +#if PY_VERSION_HEX < 0x02050000 + #define __Pyx_NAMESTR(n) ((char *)(n)) + #define __Pyx_DOCSTR(n) ((char *)(n)) +#else + #define __Pyx_NAMESTR(n) (n) + #define __Pyx_DOCSTR(n) (n) +#endif +#ifndef CYTHON_INLINE + #if defined(__GNUC__) + #define CYTHON_INLINE __inline__ + #elif defined(_MSC_VER) + #define CYTHON_INLINE __inline + #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define CYTHON_INLINE inline + #else + #define CYTHON_INLINE + #endif +#endif +#ifndef CYTHON_RESTRICT + #if defined(__GNUC__) + #define CYTHON_RESTRICT __restrict__ + #elif defined(_MSC_VER) && _MSC_VER >= 1400 + #define CYTHON_RESTRICT __restrict + #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define CYTHON_RESTRICT restrict + #else + #define CYTHON_RESTRICT + #endif +#endif +#ifdef NAN +#define __PYX_NAN() ((float) NAN) +#else +static CYTHON_INLINE float __PYX_NAN() { + /* Initialize NaN. The sign is irrelevant, an exponent with all bits 1 and + a nonzero mantissa means NaN. If the first bit in the mantissa is 1, it is + a quiet NaN. */ + float value; + memset(&value, 0xFF, sizeof(value)); + return value; +} +#endif + + +#if PY_MAJOR_VERSION >= 3 + #define __Pyx_PyNumber_Divide(x,y) PyNumber_TrueDivide(x,y) + #define __Pyx_PyNumber_InPlaceDivide(x,y) PyNumber_InPlaceTrueDivide(x,y) +#else + #define __Pyx_PyNumber_Divide(x,y) PyNumber_Divide(x,y) + #define __Pyx_PyNumber_InPlaceDivide(x,y) PyNumber_InPlaceDivide(x,y) +#endif + +#ifndef __PYX_EXTERN_C + #ifdef __cplusplus + #define __PYX_EXTERN_C extern "C" + #else + #define __PYX_EXTERN_C extern + #endif +#endif + +#if defined(WIN32) || defined(MS_WINDOWS) +#define _USE_MATH_DEFINES +#endif +#include +#define __PYX_HAVE__spacy__en +#define __PYX_HAVE_API__spacy__en +#include +#include "ios" +#include "new" +#include "stdexcept" +#include "typeinfo" +#include +#include "stdint.h" +#include "sparsehash/dense_hash_map" +#include "string.h" +#include "stdlib.h" +#include "../include/MurmurHash3.h" +#include "../include/MurmurHash2.h" +#ifdef _OPENMP +#include +#endif /* _OPENMP */ + +#ifdef PYREX_WITHOUT_ASSERTIONS +#define CYTHON_WITHOUT_ASSERTIONS +#endif + +#ifndef CYTHON_UNUSED +# if defined(__GNUC__) +# if !(defined(__cplusplus)) || (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) +# define CYTHON_UNUSED __attribute__ ((__unused__)) +# else +# define CYTHON_UNUSED +# endif +# elif defined(__ICC) || (defined(__INTEL_COMPILER) && !defined(_MSC_VER)) +# define CYTHON_UNUSED __attribute__ ((__unused__)) +# else +# define CYTHON_UNUSED +# endif +#endif +typedef struct {PyObject **p; char *s; const Py_ssize_t n; const char* encoding; + const char is_unicode; const char is_str; const char intern; } __Pyx_StringTabEntry; /*proto*/ + +#define __PYX_DEFAULT_STRING_ENCODING_IS_ASCII 0 +#define __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT 0 +#define __PYX_DEFAULT_STRING_ENCODING "" +#define __Pyx_PyObject_FromString __Pyx_PyBytes_FromString +#define __Pyx_PyObject_FromStringAndSize __Pyx_PyBytes_FromStringAndSize +#define __Pyx_fits_Py_ssize_t(v, type, is_signed) ( \ + (sizeof(type) < sizeof(Py_ssize_t)) || \ + (sizeof(type) > sizeof(Py_ssize_t) && \ + likely(v < (type)PY_SSIZE_T_MAX || \ + v == (type)PY_SSIZE_T_MAX) && \ + (!is_signed || likely(v > (type)PY_SSIZE_T_MIN || \ + v == (type)PY_SSIZE_T_MIN))) || \ + (sizeof(type) == sizeof(Py_ssize_t) && \ + (is_signed || likely(v < (type)PY_SSIZE_T_MAX || \ + v == (type)PY_SSIZE_T_MAX))) ) +static CYTHON_INLINE char* __Pyx_PyObject_AsString(PyObject*); +static CYTHON_INLINE char* __Pyx_PyObject_AsStringAndSize(PyObject*, Py_ssize_t* length); +#define __Pyx_PyByteArray_FromString(s) PyByteArray_FromStringAndSize((const char*)s, strlen((const char*)s)) +#define __Pyx_PyByteArray_FromStringAndSize(s, l) PyByteArray_FromStringAndSize((const char*)s, l) +#define __Pyx_PyBytes_FromString PyBytes_FromString +#define __Pyx_PyBytes_FromStringAndSize PyBytes_FromStringAndSize +static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(char*); +#if PY_MAJOR_VERSION < 3 + #define __Pyx_PyStr_FromString __Pyx_PyBytes_FromString + #define __Pyx_PyStr_FromStringAndSize __Pyx_PyBytes_FromStringAndSize +#else + #define __Pyx_PyStr_FromString __Pyx_PyUnicode_FromString + #define __Pyx_PyStr_FromStringAndSize __Pyx_PyUnicode_FromStringAndSize +#endif +#define __Pyx_PyObject_AsSString(s) ((signed char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_AsUString(s) ((unsigned char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_FromUString(s) __Pyx_PyObject_FromString((char*)s) +#define __Pyx_PyBytes_FromUString(s) __Pyx_PyBytes_FromString((char*)s) +#define __Pyx_PyByteArray_FromUString(s) __Pyx_PyByteArray_FromString((char*)s) +#define __Pyx_PyStr_FromUString(s) __Pyx_PyStr_FromString((char*)s) +#define __Pyx_PyUnicode_FromUString(s) __Pyx_PyUnicode_FromString((char*)s) +#if PY_MAJOR_VERSION < 3 +static CYTHON_INLINE size_t __Pyx_Py_UNICODE_strlen(const Py_UNICODE *u) +{ + const Py_UNICODE *u_end = u; + while (*u_end++) ; + return u_end - u - 1; +} +#else +#define __Pyx_Py_UNICODE_strlen Py_UNICODE_strlen +#endif +#define __Pyx_PyUnicode_FromUnicode(u) PyUnicode_FromUnicode(u, __Pyx_Py_UNICODE_strlen(u)) +#define __Pyx_PyUnicode_FromUnicodeAndLength PyUnicode_FromUnicode +#define __Pyx_PyUnicode_AsUnicode PyUnicode_AsUnicode +#define __Pyx_Owned_Py_None(b) (Py_INCREF(Py_None), Py_None) +#define __Pyx_PyBool_FromLong(b) ((b) ? (Py_INCREF(Py_True), Py_True) : (Py_INCREF(Py_False), Py_False)) +static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject*); +static CYTHON_INLINE PyObject* __Pyx_PyNumber_Int(PyObject* x); +static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject*); +static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t); +#if CYTHON_COMPILING_IN_CPYTHON +#define __pyx_PyFloat_AsDouble(x) (PyFloat_CheckExact(x) ? PyFloat_AS_DOUBLE(x) : PyFloat_AsDouble(x)) +#else +#define __pyx_PyFloat_AsDouble(x) PyFloat_AsDouble(x) +#endif +#define __pyx_PyFloat_AsFloat(x) ((float) __pyx_PyFloat_AsDouble(x)) +#if PY_MAJOR_VERSION < 3 && __PYX_DEFAULT_STRING_ENCODING_IS_ASCII +static int __Pyx_sys_getdefaultencoding_not_ascii; +static int __Pyx_init_sys_getdefaultencoding_params(void) { + PyObject* sys = NULL; + PyObject* default_encoding = NULL; + PyObject* ascii_chars_u = NULL; + PyObject* ascii_chars_b = NULL; + sys = PyImport_ImportModule("sys"); + if (sys == NULL) goto bad; + default_encoding = PyObject_CallMethod(sys, (char*) (const char*) "getdefaultencoding", NULL); + if (default_encoding == NULL) goto bad; + if (strcmp(PyBytes_AsString(default_encoding), "ascii") == 0) { + __Pyx_sys_getdefaultencoding_not_ascii = 0; + } else { + const char* default_encoding_c = PyBytes_AS_STRING(default_encoding); + char ascii_chars[128]; + int c; + for (c = 0; c < 128; c++) { + ascii_chars[c] = c; + } + __Pyx_sys_getdefaultencoding_not_ascii = 1; + ascii_chars_u = PyUnicode_DecodeASCII(ascii_chars, 128, NULL); + if (ascii_chars_u == NULL) goto bad; + ascii_chars_b = PyUnicode_AsEncodedString(ascii_chars_u, default_encoding_c, NULL); + if (ascii_chars_b == NULL || strncmp(ascii_chars, PyBytes_AS_STRING(ascii_chars_b), 128) != 0) { + PyErr_Format( + PyExc_ValueError, + "This module compiled with c_string_encoding=ascii, but default encoding '%.200s' is not a superset of ascii.", + default_encoding_c); + goto bad; + } + } + Py_XDECREF(sys); + Py_XDECREF(default_encoding); + Py_XDECREF(ascii_chars_u); + Py_XDECREF(ascii_chars_b); + return 0; +bad: + Py_XDECREF(sys); + Py_XDECREF(default_encoding); + Py_XDECREF(ascii_chars_u); + Py_XDECREF(ascii_chars_b); + return -1; +} +#endif +#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT && PY_MAJOR_VERSION >= 3 +#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_DecodeUTF8(c_str, size, NULL) +#else +#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_Decode(c_str, size, __PYX_DEFAULT_STRING_ENCODING, NULL) +#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT +static char* __PYX_DEFAULT_STRING_ENCODING; +static int __Pyx_init_sys_getdefaultencoding_params(void) { + PyObject* sys = NULL; + PyObject* default_encoding = NULL; + char* default_encoding_c; + sys = PyImport_ImportModule("sys"); + if (sys == NULL) goto bad; + default_encoding = PyObject_CallMethod(sys, (char*) (const char*) "getdefaultencoding", NULL); + if (default_encoding == NULL) goto bad; + default_encoding_c = PyBytes_AS_STRING(default_encoding); + __PYX_DEFAULT_STRING_ENCODING = (char*) malloc(strlen(default_encoding_c)); + strcpy(__PYX_DEFAULT_STRING_ENCODING, default_encoding_c); + Py_DECREF(sys); + Py_DECREF(default_encoding); + return 0; +bad: + Py_XDECREF(sys); + Py_XDECREF(default_encoding); + return -1; +} +#endif +#endif + + +#ifdef __GNUC__ + /* Test for GCC > 2.95 */ + #if __GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95)) + #define likely(x) __builtin_expect(!!(x), 1) + #define unlikely(x) __builtin_expect(!!(x), 0) + #else /* __GNUC__ > 2 ... */ + #define likely(x) (x) + #define unlikely(x) (x) + #endif /* __GNUC__ > 2 ... */ +#else /* __GNUC__ */ + #define likely(x) (x) + #define unlikely(x) (x) +#endif /* __GNUC__ */ + +static PyObject *__pyx_m; +static PyObject *__pyx_d; +static PyObject *__pyx_b; +static PyObject *__pyx_empty_tuple; +static PyObject *__pyx_empty_bytes; +static int __pyx_lineno; +static int __pyx_clineno = 0; +static const char * __pyx_cfilenm= __FILE__; +static const char *__pyx_filename; + + +static const char *__pyx_f[] = { + "en.pyx", +}; + +/* "spacy/lexeme.pxd":4 + * + * + * ctypedef int ClusterID # <<<<<<<<<<<<<< + * ctypedef uint64_t StringHash + * + */ +typedef int __pyx_t_5spacy_6lexeme_ClusterID; + +/* "spacy/lexeme.pxd":5 + * + * ctypedef int ClusterID + * ctypedef uint64_t StringHash # <<<<<<<<<<<<<< + * + * + */ +typedef uint64_t __pyx_t_5spacy_6lexeme_StringHash; + +/* "spacy/en.pxd":7 + * + * ctypedef Py_UNICODE* string_ptr + * ctypedef size_t Lexeme_addr # For python interop # <<<<<<<<<<<<<< + * ctypedef Lexeme* Lexeme_ptr + * + */ +typedef size_t __pyx_t_5spacy_2en_Lexeme_addr; + +/*--- Type declarations ---*/ +struct __pyx_t_5spacy_6lexeme_Lexeme; + +/* "spacy/lexeme.pxd":27 + * # over the Lexeme, via: + * # for field in range(LexAttr.n): get_attr(Lexeme*, field) + * cdef enum HashFields: # <<<<<<<<<<<<<< + * sic + * lex + */ +enum __pyx_t_5spacy_6lexeme_HashFields { + __pyx_e_5spacy_6lexeme_sic, + __pyx_e_5spacy_6lexeme_lex, + __pyx_e_5spacy_6lexeme_normed, + __pyx_e_5spacy_6lexeme_cluster, + __pyx_e_5spacy_6lexeme_n +}; + +/* "spacy/lexeme.pxd":8 + * + * + * cdef struct Lexeme: # <<<<<<<<<<<<<< + * StringHash sic # Hash of the original string + * StringHash lex # Hash of the word, with punctuation and clitics split off + */ +struct __pyx_t_5spacy_6lexeme_Lexeme { + __pyx_t_5spacy_6lexeme_StringHash sic; + __pyx_t_5spacy_6lexeme_StringHash lex; + __pyx_t_5spacy_6lexeme_StringHash normed; + __pyx_t_5spacy_6lexeme_StringHash last3; + Py_UNICODE first; + double prob; + __pyx_t_5spacy_6lexeme_ClusterID cluster; + int oft_upper; + int oft_title; + struct __pyx_t_5spacy_6lexeme_Lexeme *tail; +}; + +/* "spacy/en.pxd":6 + * + * + * ctypedef Py_UNICODE* string_ptr # <<<<<<<<<<<<<< + * ctypedef size_t Lexeme_addr # For python interop + * ctypedef Lexeme* Lexeme_ptr + */ +typedef Py_UNICODE *__pyx_t_5spacy_2en_string_ptr; + +/* "spacy/en.pxd":8 + * ctypedef Py_UNICODE* string_ptr + * ctypedef size_t Lexeme_addr # For python interop + * ctypedef Lexeme* Lexeme_ptr # <<<<<<<<<<<<<< + * + * + */ +typedef struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_t_5spacy_2en_Lexeme_ptr; +#ifndef CYTHON_REFNANNY + #define CYTHON_REFNANNY 0 +#endif +#if CYTHON_REFNANNY + typedef struct { + void (*INCREF)(void*, PyObject*, int); + void (*DECREF)(void*, PyObject*, int); + void (*GOTREF)(void*, PyObject*, int); + void (*GIVEREF)(void*, PyObject*, int); + void* (*SetupContext)(const char*, int, const char*); + void (*FinishContext)(void**); + } __Pyx_RefNannyAPIStruct; + static __Pyx_RefNannyAPIStruct *__Pyx_RefNanny = NULL; + static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname); /*proto*/ + #define __Pyx_RefNannyDeclarations void *__pyx_refnanny = NULL; +#ifdef WITH_THREAD + #define __Pyx_RefNannySetupContext(name, acquire_gil) \ + if (acquire_gil) { \ + PyGILState_STATE __pyx_gilstate_save = PyGILState_Ensure(); \ + __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__); \ + PyGILState_Release(__pyx_gilstate_save); \ + } else { \ + __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__); \ + } +#else + #define __Pyx_RefNannySetupContext(name, acquire_gil) \ + __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__) +#endif + #define __Pyx_RefNannyFinishContext() \ + __Pyx_RefNanny->FinishContext(&__pyx_refnanny) + #define __Pyx_INCREF(r) __Pyx_RefNanny->INCREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_DECREF(r) __Pyx_RefNanny->DECREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_GOTREF(r) __Pyx_RefNanny->GOTREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_GIVEREF(r) __Pyx_RefNanny->GIVEREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_XINCREF(r) do { if((r) != NULL) {__Pyx_INCREF(r); }} while(0) + #define __Pyx_XDECREF(r) do { if((r) != NULL) {__Pyx_DECREF(r); }} while(0) + #define __Pyx_XGOTREF(r) do { if((r) != NULL) {__Pyx_GOTREF(r); }} while(0) + #define __Pyx_XGIVEREF(r) do { if((r) != NULL) {__Pyx_GIVEREF(r);}} while(0) +#else + #define __Pyx_RefNannyDeclarations + #define __Pyx_RefNannySetupContext(name, acquire_gil) + #define __Pyx_RefNannyFinishContext() + #define __Pyx_INCREF(r) Py_INCREF(r) + #define __Pyx_DECREF(r) Py_DECREF(r) + #define __Pyx_GOTREF(r) + #define __Pyx_GIVEREF(r) + #define __Pyx_XINCREF(r) Py_XINCREF(r) + #define __Pyx_XDECREF(r) Py_XDECREF(r) + #define __Pyx_XGOTREF(r) + #define __Pyx_XGIVEREF(r) +#endif /* CYTHON_REFNANNY */ +#define __Pyx_XDECREF_SET(r, v) do { \ + PyObject *tmp = (PyObject *) r; \ + r = v; __Pyx_XDECREF(tmp); \ + } while (0) +#define __Pyx_DECREF_SET(r, v) do { \ + PyObject *tmp = (PyObject *) r; \ + r = v; __Pyx_DECREF(tmp); \ + } while (0) +#define __Pyx_CLEAR(r) do { PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);} while(0) +#define __Pyx_XCLEAR(r) do { if((r) != NULL) {PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);}} while(0) + +#if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, PyObject* attr_name) { + PyTypeObject* tp = Py_TYPE(obj); + if (likely(tp->tp_getattro)) + return tp->tp_getattro(obj, attr_name); +#if PY_MAJOR_VERSION < 3 + if (likely(tp->tp_getattr)) + return tp->tp_getattr(obj, PyString_AS_STRING(attr_name)); +#endif + return PyObject_GetAttr(obj, attr_name); +} +#else +#define __Pyx_PyObject_GetAttrStr(o,n) PyObject_GetAttr(o,n) +#endif + +static PyObject *__Pyx_GetBuiltinName(PyObject *name); /*proto*/ + +#include + +static CYTHON_INLINE int __Pyx_PyBytes_Equals(PyObject* s1, PyObject* s2, int equals); /*proto*/ + +static CYTHON_INLINE int __Pyx_PyUnicode_Equals(PyObject* s1, PyObject* s2, int equals); /*proto*/ + +static CYTHON_INLINE int __Pyx_ArgTypeTest(PyObject *obj, PyTypeObject *type, int none_allowed, + const char *name, int exact); /*proto*/ + +static void __Pyx_RaiseArgtupleInvalid(const char* func_name, int exact, + Py_ssize_t num_min, Py_ssize_t num_max, Py_ssize_t num_found); /*proto*/ + +static void __Pyx_RaiseDoubleKeywordsError(const char* func_name, PyObject* kw_name); /*proto*/ + +static int __Pyx_ParseOptionalKeywords(PyObject *kwds, PyObject **argnames[], \ + PyObject *kwds2, PyObject *values[], Py_ssize_t num_pos_args, \ + const char* function_name); /*proto*/ + +static CYTHON_INLINE PyObject *__Pyx_GetModuleGlobalName(PyObject *name); /*proto*/ + +#define __Pyx_GetItemInt(o, i, type, is_signed, to_py_func, is_list, wraparound, boundscheck) \ + (__Pyx_fits_Py_ssize_t(i, type, is_signed) ? \ + __Pyx_GetItemInt_Fast(o, (Py_ssize_t)i, is_list, wraparound, boundscheck) : \ + (is_list ? (PyErr_SetString(PyExc_IndexError, "list index out of range"), (PyObject*)NULL) : \ + __Pyx_GetItemInt_Generic(o, to_py_func(i)))) +#define __Pyx_GetItemInt_List(o, i, type, is_signed, to_py_func, is_list, wraparound, boundscheck) \ + (__Pyx_fits_Py_ssize_t(i, type, is_signed) ? \ + __Pyx_GetItemInt_List_Fast(o, (Py_ssize_t)i, wraparound, boundscheck) : \ + (PyErr_SetString(PyExc_IndexError, "list index out of range"), (PyObject*)NULL)) +static CYTHON_INLINE PyObject *__Pyx_GetItemInt_List_Fast(PyObject *o, Py_ssize_t i, + int wraparound, int boundscheck); +#define __Pyx_GetItemInt_Tuple(o, i, type, is_signed, to_py_func, is_list, wraparound, boundscheck) \ + (__Pyx_fits_Py_ssize_t(i, type, is_signed) ? \ + __Pyx_GetItemInt_Tuple_Fast(o, (Py_ssize_t)i, wraparound, boundscheck) : \ + (PyErr_SetString(PyExc_IndexError, "tuple index out of range"), (PyObject*)NULL)) +static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Tuple_Fast(PyObject *o, Py_ssize_t i, + int wraparound, int boundscheck); +static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Generic(PyObject *o, PyObject* j); +static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Fast(PyObject *o, Py_ssize_t i, + int is_list, int wraparound, int boundscheck); + +#if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw); /*proto*/ +#else +#define __Pyx_PyObject_Call(func, arg, kw) PyObject_Call(func, arg, kw) +#endif + +static CYTHON_INLINE void __Pyx_ErrRestore(PyObject *type, PyObject *value, PyObject *tb); /*proto*/ +static CYTHON_INLINE void __Pyx_ErrFetch(PyObject **type, PyObject **value, PyObject **tb); /*proto*/ + +static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, PyObject *cause); /*proto*/ + +#define __Pyx_GetItemInt_Unicode(o, i, type, is_signed, to_py_func, is_list, wraparound, boundscheck) \ + (__Pyx_fits_Py_ssize_t(i, type, is_signed) ? \ + __Pyx_GetItemInt_Unicode_Fast(o, (Py_ssize_t)i, wraparound, boundscheck) : \ + (PyErr_SetString(PyExc_IndexError, "string index out of range"), (Py_UCS4)-1)) +static CYTHON_INLINE Py_UCS4 __Pyx_GetItemInt_Unicode_Fast(PyObject* ustring, Py_ssize_t i, + int wraparound, int boundscheck); + +static CYTHON_INLINE PyObject* __Pyx_PyUnicode_Substring( + PyObject* text, Py_ssize_t start, Py_ssize_t stop); + +#define __Pyx_SetItemInt(o, i, v, type, is_signed, to_py_func, is_list, wraparound, boundscheck) \ + (__Pyx_fits_Py_ssize_t(i, type, is_signed) ? \ + __Pyx_SetItemInt_Fast(o, (Py_ssize_t)i, v, is_list, wraparound, boundscheck) : \ + (is_list ? (PyErr_SetString(PyExc_IndexError, "list assignment index out of range"), -1) : \ + __Pyx_SetItemInt_Generic(o, to_py_func(i), v))) +static CYTHON_INLINE int __Pyx_SetItemInt_Generic(PyObject *o, PyObject *j, PyObject *v); +static CYTHON_INLINE int __Pyx_SetItemInt_Fast(PyObject *o, Py_ssize_t i, PyObject *v, + int is_list, int wraparound, int boundscheck); + +static void __Pyx_WriteUnraisable(const char *name, int clineno, + int lineno, const char *filename, + int full_traceback); /*proto*/ + +static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *); + +static CYTHON_INLINE uint64_t __Pyx_PyInt_As_uint64_t(PyObject *); + +static CYTHON_INLINE size_t __Pyx_PyInt_As_size_t(PyObject *); + +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_uint64_t(uint64_t value); + +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value); + +static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *); + +static int __Pyx_check_binary_version(void); + +#if CYTHON_COMPILING_IN_CPYTHON +#define __Pyx_PyObject_DelAttrStr(o,n) __Pyx_PyObject_SetAttrStr(o,n,NULL) +static CYTHON_INLINE int __Pyx_PyObject_SetAttrStr(PyObject* obj, PyObject* attr_name, PyObject* value) { + PyTypeObject* tp = Py_TYPE(obj); + if (likely(tp->tp_setattro)) + return tp->tp_setattro(obj, attr_name, value); +#if PY_MAJOR_VERSION < 3 + if (likely(tp->tp_setattr)) + return tp->tp_setattr(obj, PyString_AS_STRING(attr_name), value); +#endif + return PyObject_SetAttr(obj, attr_name, value); +} +#else +#define __Pyx_PyObject_DelAttrStr(o,n) PyObject_DelAttr(o,n) +#define __Pyx_PyObject_SetAttrStr(o,n,v) PyObject_SetAttr(o,n,v) +#endif + +static int __Pyx_ExportVoidPtr(PyObject *name, void *p, const char *sig); /*proto*/ + +static int __Pyx_ExportFunction(const char *name, void (*f)(void), const char *sig); /*proto*/ + +typedef struct { + int code_line; + PyCodeObject* code_object; +} __Pyx_CodeObjectCacheEntry; +struct __Pyx_CodeObjectCache { + int count; + int max_count; + __Pyx_CodeObjectCacheEntry* entries; +}; +static struct __Pyx_CodeObjectCache __pyx_code_cache = {0,0,NULL}; +static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line); +static PyCodeObject *__pyx_find_code_object(int code_line); +static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object); + +static void __Pyx_AddTraceback(const char *funcname, int c_line, + int py_line, const char *filename); /*proto*/ + +static int __Pyx_InitStrings(__Pyx_StringTabEntry *t); /*proto*/ + + +/* Module declarations from 'libcpp.utility' */ + +/* Module declarations from 'libcpp.vector' */ + +/* Module declarations from 'libc.stdint' */ + +/* Module declarations from 'ext.sparsehash' */ + +/* Module declarations from 'spacy.lexeme' */ + +/* Module declarations from 'libc.string' */ + +/* Module declarations from 'libc.stdlib' */ + +/* Module declarations from 'ext.murmurhash' */ + +/* Module declarations from 'spacy.en' */ +static google::dense_hash_map<__pyx_t_5spacy_6lexeme_StringHash,__pyx_t_5spacy_2en_Lexeme_ptr> __pyx_v_5spacy_2en_LEXEMES; +static struct __pyx_t_5spacy_6lexeme_Lexeme __pyx_v_5spacy_2en_BLANK_WORD; +static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup(PyObject *, int __pyx_skip_dispatch); /*proto*/ +static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup_chunk(PyObject *, int, int, int __pyx_skip_dispatch); /*proto*/ +static __pyx_t_5spacy_6lexeme_StringHash __pyx_f_5spacy_2en_hash_string(PyObject *, size_t); /*proto*/ +static PyObject *__pyx_f_5spacy_2en_unhash(__pyx_t_5spacy_6lexeme_StringHash, int __pyx_skip_dispatch); /*proto*/ +static PyObject *__pyx_f_5spacy_2en_normalize_word_string(PyObject *); /*proto*/ +static PyObject *__pyx_f_5spacy_2en__substr(PyObject *, int, int, size_t, int __pyx_skip_dispatch); /*proto*/ +static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__add(__pyx_t_5spacy_6lexeme_StringHash, PyObject *, int, size_t); /*proto*/ +static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyObject *, __pyx_t_5spacy_6lexeme_StringHash, int, size_t); /*proto*/ +static size_t __pyx_f_5spacy_2en__find_split(PyObject *, size_t); /*proto*/ +#define __Pyx_MODULE_NAME "spacy.en" +int __pyx_module_is_main_spacy__en = 0; + +/* Implementation of 'spacy.en' */ +static PyObject *__pyx_builtin_ValueError; +static PyObject *__pyx_pf_5spacy_2en_lookup(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_string); /* proto */ +static PyObject *__pyx_pf_5spacy_2en_2lookup_chunk(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_string, int __pyx_v_start, int __pyx_v_end); /* proto */ +static PyObject *__pyx_pf_5spacy_2en_4unhash(CYTHON_UNUSED PyObject *__pyx_self, __pyx_t_5spacy_6lexeme_StringHash __pyx_v_hash_value); /* proto */ +static PyObject *__pyx_pf_5spacy_2en_6_substr(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_string, int __pyx_v_start, int __pyx_v_end, size_t __pyx_v_length); /* proto */ +static char __pyx_k_[] = ""; +static char __pyx_k_end[] = "end"; +static char __pyx_k_lex[] = "lex"; +static char __pyx_k_sic[] = "sic"; +static char __pyx_k_YEAR[] = "!YEAR"; +static char __pyx_k_main[] = "__main__"; +static char __pyx_k_prob[] = "prob"; +static char __pyx_k_tail[] = "tail"; +static char __pyx_k_test[] = "__test__"; +static char __pyx_k_first[] = "first"; +static char __pyx_k_last3[] = "last3"; +static char __pyx_k_lower[] = "lower"; +static char __pyx_k_start[] = "start"; +static char __pyx_k_DIGITS[] = "!DIGITS"; +static char __pyx_k_length[] = "length"; +static char __pyx_k_normed[] = "normed"; +static char __pyx_k_string[] = "string"; +static char __pyx_k_LEXEMES[] = "LEXEMES"; +static char __pyx_k_STRINGS[] = "STRINGS"; +static char __pyx_k_cluster[] = "cluster"; +static char __pyx_k_isdigit[] = "isdigit"; +static char __pyx_k_pyx_capi[] = "__pyx_capi__"; +static char __pyx_k_oft_title[] = "oft_title"; +static char __pyx_k_oft_upper[] = "oft_upper"; +static char __pyx_k_ValueError[] = "ValueError"; +static char __pyx_k_Serve_pointers_to_Lexeme_structs[] = "Serve pointers to Lexeme structs, given strings. Maintain a reverse index,\nso that strings can be retrieved from hashes. Use 64-bit hash values and\nboldly assume no collisions.\n"; +static PyObject *__pyx_kp_u_; +static PyObject *__pyx_kp_u_DIGITS; +static PyObject *__pyx_n_s_LEXEMES; +static PyObject *__pyx_n_s_STRINGS; +static PyObject *__pyx_n_s_ValueError; +static PyObject *__pyx_kp_u_YEAR; +static PyObject *__pyx_n_s_cluster; +static PyObject *__pyx_n_s_end; +static PyObject *__pyx_n_s_first; +static PyObject *__pyx_n_s_isdigit; +static PyObject *__pyx_n_s_last3; +static PyObject *__pyx_n_s_length; +static PyObject *__pyx_n_s_lex; +static PyObject *__pyx_n_s_lower; +static PyObject *__pyx_n_s_main; +static PyObject *__pyx_n_s_normed; +static PyObject *__pyx_n_s_oft_title; +static PyObject *__pyx_n_s_oft_upper; +static PyObject *__pyx_n_s_prob; +static PyObject *__pyx_n_s_pyx_capi; +static PyObject *__pyx_n_s_sic; +static PyObject *__pyx_n_s_start; +static PyObject *__pyx_n_s_string; +static PyObject *__pyx_n_s_tail; +static PyObject *__pyx_n_s_test; + +/* "spacy/en.pyx":23 + * + * + * cpdef Lexeme_addr lookup(unicode string) except 0: # <<<<<<<<<<<<<< + * '''.. function:: enumerate(sequence[, start=0]) + * Fetch a Lexeme representing a word string. If the word has not been seen, + */ + +static PyObject *__pyx_pw_5spacy_2en_1lookup(PyObject *__pyx_self, PyObject *__pyx_v_string); /*proto*/ +static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup(PyObject *__pyx_v_string, CYTHON_UNUSED int __pyx_skip_dispatch) { + size_t __pyx_v_length; + __pyx_t_5spacy_6lexeme_StringHash __pyx_v_hashed; + struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_v_word_ptr; + __pyx_t_5spacy_2en_Lexeme_addr __pyx_r; + __Pyx_RefNannyDeclarations + int __pyx_t_1; + int __pyx_t_2; + Py_ssize_t __pyx_t_3; + __pyx_t_5spacy_6lexeme_StringHash __pyx_t_4; + struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_t_5; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("lookup", 0); + + /* "spacy/en.pyx":31 + * To specify the boundaries of the word if it has not been seen, use lookup_chunk. + * ''' + * if string == '': # <<<<<<<<<<<<<< + * return &BLANK_WORD + * cdef size_t length = len(string) + */ + __pyx_t_1 = (__Pyx_PyUnicode_Equals(__pyx_v_string, __pyx_kp_u_, Py_EQ)); if (unlikely(__pyx_t_1 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = (__pyx_t_1 != 0); + if (__pyx_t_2) { + + /* "spacy/en.pyx":32 + * ''' + * if string == '': + * return &BLANK_WORD # <<<<<<<<<<<<<< + * cdef size_t length = len(string) + * cdef StringHash hashed = hash_string(string, length) + */ + __pyx_r = ((__pyx_t_5spacy_2en_Lexeme_addr)(&__pyx_v_5spacy_2en_BLANK_WORD)); + goto __pyx_L0; + } + + /* "spacy/en.pyx":33 + * if string == '': + * return &BLANK_WORD + * cdef size_t length = len(string) # <<<<<<<<<<<<<< + * cdef StringHash hashed = hash_string(string, length) + * cdef Lexeme* word_ptr = LEXEMES[hashed] + */ + if (unlikely(__pyx_v_string == Py_None)) { + PyErr_SetString(PyExc_TypeError, "object of type 'NoneType' has no len()"); + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 33; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + } + __pyx_t_3 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_string); if (unlikely(__pyx_t_3 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 33; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_v_length = __pyx_t_3; + + /* "spacy/en.pyx":34 + * return &BLANK_WORD + * cdef size_t length = len(string) + * cdef StringHash hashed = hash_string(string, length) # <<<<<<<<<<<<<< + * cdef Lexeme* word_ptr = LEXEMES[hashed] + * cdef size_t n + */ + __pyx_t_4 = __pyx_f_5spacy_2en_hash_string(__pyx_v_string, __pyx_v_length); if (unlikely(__pyx_t_4 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 34; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_v_hashed = __pyx_t_4; + + /* "spacy/en.pyx":35 + * cdef size_t length = len(string) + * cdef StringHash hashed = hash_string(string, length) + * cdef Lexeme* word_ptr = LEXEMES[hashed] # <<<<<<<<<<<<<< + * cdef size_t n + * if word_ptr == NULL: + */ + __pyx_v_word_ptr = (__pyx_v_5spacy_2en_LEXEMES[__pyx_v_hashed]); + + /* "spacy/en.pyx":37 + * cdef Lexeme* word_ptr = LEXEMES[hashed] + * cdef size_t n + * if word_ptr == NULL: # <<<<<<<<<<<<<< + * word_ptr = _add(hashed, string, _find_split(string, length), length) + * return word_ptr + */ + __pyx_t_2 = ((__pyx_v_word_ptr == NULL) != 0); + if (__pyx_t_2) { + + /* "spacy/en.pyx":38 + * cdef size_t n + * if word_ptr == NULL: + * word_ptr = _add(hashed, string, _find_split(string, length), length) # <<<<<<<<<<<<<< + * return word_ptr + * + */ + __pyx_t_5 = __pyx_f_5spacy_2en__add(__pyx_v_hashed, __pyx_v_string, __pyx_f_5spacy_2en__find_split(__pyx_v_string, __pyx_v_length), __pyx_v_length); if (unlikely(__pyx_t_5 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 38; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_v_word_ptr = __pyx_t_5; + goto __pyx_L4; + } + __pyx_L4:; + + /* "spacy/en.pyx":39 + * if word_ptr == NULL: + * word_ptr = _add(hashed, string, _find_split(string, length), length) + * return word_ptr # <<<<<<<<<<<<<< + * + * + */ + __pyx_r = ((__pyx_t_5spacy_2en_Lexeme_addr)__pyx_v_word_ptr); + goto __pyx_L0; + + /* "spacy/en.pyx":23 + * + * + * cpdef Lexeme_addr lookup(unicode string) except 0: # <<<<<<<<<<<<<< + * '''.. function:: enumerate(sequence[, start=0]) + * Fetch a Lexeme representing a word string. If the word has not been seen, + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_AddTraceback("spacy.en.lookup", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* Python wrapper */ +static PyObject *__pyx_pw_5spacy_2en_1lookup(PyObject *__pyx_self, PyObject *__pyx_v_string); /*proto*/ +static char __pyx_doc_5spacy_2en_lookup[] = ".. function:: enumerate(sequence[, start=0])\n Fetch a Lexeme representing a word string. If the word has not been seen,\n construct one, splitting off any attached punctuation or clitics. A\n reference to BLANK_WORD is returned for the empty string.\n \n To specify the boundaries of the word if it has not been seen, use lookup_chunk.\n "; +static PyObject *__pyx_pw_5spacy_2en_1lookup(PyObject *__pyx_self, PyObject *__pyx_v_string) { + CYTHON_UNUSED int __pyx_lineno = 0; + CYTHON_UNUSED const char *__pyx_filename = NULL; + CYTHON_UNUSED int __pyx_clineno = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("lookup (wrapper)", 0); + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_string), (&PyUnicode_Type), 1, "string", 1))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 23; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_r = __pyx_pf_5spacy_2en_lookup(__pyx_self, ((PyObject*)__pyx_v_string)); + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __pyx_r = NULL; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5spacy_2en_lookup(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_string) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + __pyx_t_5spacy_2en_Lexeme_addr __pyx_t_1; + PyObject *__pyx_t_2 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("lookup", 0); + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = __pyx_f_5spacy_2en_lookup(__pyx_v_string, 0); if (unlikely(__pyx_t_1 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 23; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyInt_FromSize_t(__pyx_t_1); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 23; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_2); + __pyx_r = __pyx_t_2; + __pyx_t_2 = 0; + goto __pyx_L0; + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_2); + __Pyx_AddTraceback("spacy.en.lookup", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "spacy/en.pyx":42 + * + * + * cpdef Lexeme_addr lookup_chunk(unicode string, int start, int end) except 0: # <<<<<<<<<<<<<< + * '''Fetch a Lexeme representing a word string. If the word has not been seen, + * construct one, given the specified start and end indices. A negative index + */ + +static PyObject *__pyx_pw_5spacy_2en_3lookup_chunk(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup_chunk(PyObject *__pyx_v_string, int __pyx_v_start, CYTHON_UNUSED int __pyx_v_end, CYTHON_UNUSED int __pyx_skip_dispatch) { + size_t __pyx_v_length; + __pyx_t_5spacy_6lexeme_StringHash __pyx_v_hashed; + struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_v_chunk_ptr; + __pyx_t_5spacy_2en_Lexeme_addr __pyx_r; + __Pyx_RefNannyDeclarations + int __pyx_t_1; + int __pyx_t_2; + Py_ssize_t __pyx_t_3; + __pyx_t_5spacy_6lexeme_StringHash __pyx_t_4; + struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_t_5; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("lookup_chunk", 0); + + /* "spacy/en.pyx":50 + * A reference to BLANK_WORD is returned for the empty string. + * ''' + * if string == '': # <<<<<<<<<<<<<< + * return &BLANK_WORD + * cdef size_t length = len(string) + */ + __pyx_t_1 = (__Pyx_PyUnicode_Equals(__pyx_v_string, __pyx_kp_u_, Py_EQ)); if (unlikely(__pyx_t_1 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 50; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = (__pyx_t_1 != 0); + if (__pyx_t_2) { + + /* "spacy/en.pyx":51 + * ''' + * if string == '': + * return &BLANK_WORD # <<<<<<<<<<<<<< + * cdef size_t length = len(string) + * cdef StringHash hashed = hash_string(string, length) + */ + __pyx_r = ((__pyx_t_5spacy_2en_Lexeme_addr)(&__pyx_v_5spacy_2en_BLANK_WORD)); + goto __pyx_L0; + } + + /* "spacy/en.pyx":52 + * if string == '': + * return &BLANK_WORD + * cdef size_t length = len(string) # <<<<<<<<<<<<<< + * cdef StringHash hashed = hash_string(string, length) + * cdef Lexeme* chunk_ptr = LEXEMES[hashed] + */ + if (unlikely(__pyx_v_string == Py_None)) { + PyErr_SetString(PyExc_TypeError, "object of type 'NoneType' has no len()"); + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 52; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + } + __pyx_t_3 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_string); if (unlikely(__pyx_t_3 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 52; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_v_length = __pyx_t_3; + + /* "spacy/en.pyx":53 + * return &BLANK_WORD + * cdef size_t length = len(string) + * cdef StringHash hashed = hash_string(string, length) # <<<<<<<<<<<<<< + * cdef Lexeme* chunk_ptr = LEXEMES[hashed] + * if chunk_ptr == NULL: + */ + __pyx_t_4 = __pyx_f_5spacy_2en_hash_string(__pyx_v_string, __pyx_v_length); if (unlikely(__pyx_t_4 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 53; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_v_hashed = __pyx_t_4; + + /* "spacy/en.pyx":54 + * cdef size_t length = len(string) + * cdef StringHash hashed = hash_string(string, length) + * cdef Lexeme* chunk_ptr = LEXEMES[hashed] # <<<<<<<<<<<<<< + * if chunk_ptr == NULL: + * chunk_ptr = _add(hashed, string, start, length) + */ + __pyx_v_chunk_ptr = (__pyx_v_5spacy_2en_LEXEMES[__pyx_v_hashed]); + + /* "spacy/en.pyx":55 + * cdef StringHash hashed = hash_string(string, length) + * cdef Lexeme* chunk_ptr = LEXEMES[hashed] + * if chunk_ptr == NULL: # <<<<<<<<<<<<<< + * chunk_ptr = _add(hashed, string, start, length) + * return chunk_ptr + */ + __pyx_t_2 = ((__pyx_v_chunk_ptr == NULL) != 0); + if (__pyx_t_2) { + + /* "spacy/en.pyx":56 + * cdef Lexeme* chunk_ptr = LEXEMES[hashed] + * if chunk_ptr == NULL: + * chunk_ptr = _add(hashed, string, start, length) # <<<<<<<<<<<<<< + * return chunk_ptr + * + */ + __pyx_t_5 = __pyx_f_5spacy_2en__add(__pyx_v_hashed, __pyx_v_string, __pyx_v_start, __pyx_v_length); if (unlikely(__pyx_t_5 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 56; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_v_chunk_ptr = __pyx_t_5; + goto __pyx_L4; + } + __pyx_L4:; + + /* "spacy/en.pyx":57 + * if chunk_ptr == NULL: + * chunk_ptr = _add(hashed, string, start, length) + * return chunk_ptr # <<<<<<<<<<<<<< + * + * + */ + __pyx_r = ((__pyx_t_5spacy_2en_Lexeme_addr)__pyx_v_chunk_ptr); + goto __pyx_L0; + + /* "spacy/en.pyx":42 + * + * + * cpdef Lexeme_addr lookup_chunk(unicode string, int start, int end) except 0: # <<<<<<<<<<<<<< + * '''Fetch a Lexeme representing a word string. If the word has not been seen, + * construct one, given the specified start and end indices. A negative index + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_AddTraceback("spacy.en.lookup_chunk", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* Python wrapper */ +static PyObject *__pyx_pw_5spacy_2en_3lookup_chunk(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static char __pyx_doc_5spacy_2en_2lookup_chunk[] = "Fetch a Lexeme representing a word string. If the word has not been seen,\n construct one, given the specified start and end indices. A negative index\n significes 0 for start, and the string length for end --- i.e. the string\n will not be sliced if start == -1 and end == -1.\n \n A reference to BLANK_WORD is returned for the empty string.\n "; +static PyObject *__pyx_pw_5spacy_2en_3lookup_chunk(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + PyObject *__pyx_v_string = 0; + int __pyx_v_start; + int __pyx_v_end; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("lookup_chunk (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_string,&__pyx_n_s_start,&__pyx_n_s_end,0}; + PyObject* values[3] = {0,0,0}; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_string)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + case 1: + if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_start)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("lookup_chunk", 1, 3, 3, 1); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 42; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + } + case 2: + if (likely((values[2] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_end)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("lookup_chunk", 1, 3, 3, 2); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 42; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "lookup_chunk") < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 42; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + } + } else if (PyTuple_GET_SIZE(__pyx_args) != 3) { + goto __pyx_L5_argtuple_error; + } else { + values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + } + __pyx_v_string = ((PyObject*)values[0]); + __pyx_v_start = __Pyx_PyInt_As_int(values[1]); if (unlikely((__pyx_v_start == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 42; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_end = __Pyx_PyInt_As_int(values[2]); if (unlikely((__pyx_v_end == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 42; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("lookup_chunk", 1, 3, 3, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 42; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_L3_error:; + __Pyx_AddTraceback("spacy.en.lookup_chunk", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_string), (&PyUnicode_Type), 1, "string", 1))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 42; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_r = __pyx_pf_5spacy_2en_2lookup_chunk(__pyx_self, __pyx_v_string, __pyx_v_start, __pyx_v_end); + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __pyx_r = NULL; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5spacy_2en_2lookup_chunk(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_string, int __pyx_v_start, int __pyx_v_end) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + __pyx_t_5spacy_2en_Lexeme_addr __pyx_t_1; + PyObject *__pyx_t_2 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("lookup_chunk", 0); + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = __pyx_f_5spacy_2en_lookup_chunk(__pyx_v_string, __pyx_v_start, __pyx_v_end, 0); if (unlikely(__pyx_t_1 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 42; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyInt_FromSize_t(__pyx_t_1); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 42; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_2); + __pyx_r = __pyx_t_2; + __pyx_t_2 = 0; + goto __pyx_L0; + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_2); + __Pyx_AddTraceback("spacy.en.lookup_chunk", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "spacy/en.pyx":60 + * + * + * cdef StringHash hash_string(unicode s, size_t length) except 0: # <<<<<<<<<<<<<< + * '''Hash unicode with MurmurHash64A''' + * assert length + */ + +static __pyx_t_5spacy_6lexeme_StringHash __pyx_f_5spacy_2en_hash_string(PyObject *__pyx_v_s, size_t __pyx_v_length) { + __pyx_t_5spacy_6lexeme_StringHash __pyx_r; + __Pyx_RefNannyDeclarations + __pyx_t_5spacy_2en_string_ptr __pyx_t_1; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("hash_string", 0); + + /* "spacy/en.pyx":62 + * cdef StringHash hash_string(unicode s, size_t length) except 0: + * '''Hash unicode with MurmurHash64A''' + * assert length # <<<<<<<<<<<<<< + * return MurmurHash64A(s, length * sizeof(Py_UNICODE), 0) + * + */ + #ifndef CYTHON_WITHOUT_ASSERTIONS + if (unlikely(!Py_OptimizeFlag)) { + if (unlikely(!(__pyx_v_length != 0))) { + PyErr_SetNone(PyExc_AssertionError); + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 62; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + } + } + #endif + + /* "spacy/en.pyx":63 + * '''Hash unicode with MurmurHash64A''' + * assert length + * return MurmurHash64A(s, length * sizeof(Py_UNICODE), 0) # <<<<<<<<<<<<<< + * + * + */ + __pyx_t_1 = __Pyx_PyUnicode_AsUnicode(__pyx_v_s); if (unlikely((!__pyx_t_1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 63; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_r = MurmurHash64A(((__pyx_t_5spacy_2en_string_ptr)__pyx_t_1), (__pyx_v_length * (sizeof(Py_UNICODE))), 0); + goto __pyx_L0; + + /* "spacy/en.pyx":60 + * + * + * cdef StringHash hash_string(unicode s, size_t length) except 0: # <<<<<<<<<<<<<< + * '''Hash unicode with MurmurHash64A''' + * assert length + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_AddTraceback("spacy.en.hash_string", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "spacy/en.pyx":66 + * + * + * cpdef unicode unhash(StringHash hash_value): # <<<<<<<<<<<<<< + * '''Fetch a string from the reverse index, given its hash value.''' + * cdef string_ptr string = STRINGS[hash_value] + */ + +static PyObject *__pyx_pw_5spacy_2en_5unhash(PyObject *__pyx_self, PyObject *__pyx_arg_hash_value); /*proto*/ +static PyObject *__pyx_f_5spacy_2en_unhash(__pyx_t_5spacy_6lexeme_StringHash __pyx_v_hash_value, CYTHON_UNUSED int __pyx_skip_dispatch) { + __pyx_t_5spacy_2en_string_ptr __pyx_v_string; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + __pyx_t_5spacy_2en_string_ptr __pyx_t_3; + int __pyx_t_4; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("unhash", 0); + + /* "spacy/en.pyx":68 + * cpdef unicode unhash(StringHash hash_value): + * '''Fetch a string from the reverse index, given its hash value.''' + * cdef string_ptr string = STRINGS[hash_value] # <<<<<<<<<<<<<< + * if string == NULL: + * raise ValueError(hash_value) + */ + __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_STRINGS); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 68; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = __Pyx_GetItemInt(__pyx_t_1, __pyx_v_hash_value, __pyx_t_5spacy_6lexeme_StringHash, 0, __Pyx_PyInt_From_uint64_t, 0, 0, 1); if (unlikely(__pyx_t_2 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 68; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_3 = __Pyx_PyUnicode_AsUnicode(__pyx_t_2); if (unlikely((!__pyx_t_3) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 68; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_v_string = __pyx_t_3; + + /* "spacy/en.pyx":69 + * '''Fetch a string from the reverse index, given its hash value.''' + * cdef string_ptr string = STRINGS[hash_value] + * if string == NULL: # <<<<<<<<<<<<<< + * raise ValueError(hash_value) + * + */ + __pyx_t_4 = ((__pyx_v_string == NULL) != 0); + if (__pyx_t_4) { + + /* "spacy/en.pyx":70 + * cdef string_ptr string = STRINGS[hash_value] + * if string == NULL: + * raise ValueError(hash_value) # <<<<<<<<<<<<<< + * + * return string + */ + __pyx_t_2 = __Pyx_PyInt_From_uint64_t(__pyx_v_hash_value); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 70; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 70; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_1); + PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_t_2); + __Pyx_GIVEREF(__pyx_t_2); + __pyx_t_2 = 0; + __pyx_t_2 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_t_1, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 70; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_Raise(__pyx_t_2, 0, 0, 0); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 70; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + } + + /* "spacy/en.pyx":72 + * raise ValueError(hash_value) + * + * return string # <<<<<<<<<<<<<< + * + * + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_2 = __Pyx_PyUnicode_FromUnicode(__pyx_v_string); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 72; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_2); + if (!(likely(PyUnicode_CheckExact(__pyx_t_2))||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "unicode", Py_TYPE(__pyx_t_2)->tp_name), 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 72; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_r = ((PyObject*)__pyx_t_2); + __pyx_t_2 = 0; + goto __pyx_L0; + + /* "spacy/en.pyx":66 + * + * + * cpdef unicode unhash(StringHash hash_value): # <<<<<<<<<<<<<< + * '''Fetch a string from the reverse index, given its hash value.''' + * cdef string_ptr string = STRINGS[hash_value] + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_2); + __Pyx_AddTraceback("spacy.en.unhash", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* Python wrapper */ +static PyObject *__pyx_pw_5spacy_2en_5unhash(PyObject *__pyx_self, PyObject *__pyx_arg_hash_value); /*proto*/ +static char __pyx_doc_5spacy_2en_4unhash[] = "Fetch a string from the reverse index, given its hash value."; +static PyObject *__pyx_pw_5spacy_2en_5unhash(PyObject *__pyx_self, PyObject *__pyx_arg_hash_value) { + __pyx_t_5spacy_6lexeme_StringHash __pyx_v_hash_value; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("unhash (wrapper)", 0); + assert(__pyx_arg_hash_value); { + __pyx_v_hash_value = __Pyx_PyInt_As_uint64_t(__pyx_arg_hash_value); if (unlikely((__pyx_v_hash_value == (uint64_t)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 66; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L3_error:; + __Pyx_AddTraceback("spacy.en.unhash", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + __pyx_r = __pyx_pf_5spacy_2en_4unhash(__pyx_self, ((__pyx_t_5spacy_6lexeme_StringHash)__pyx_v_hash_value)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5spacy_2en_4unhash(CYTHON_UNUSED PyObject *__pyx_self, __pyx_t_5spacy_6lexeme_StringHash __pyx_v_hash_value) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("unhash", 0); + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = __pyx_f_5spacy_2en_unhash(__pyx_v_hash_value, 0); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 66; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("spacy.en.unhash", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "spacy/en.pyx":75 + * + * + * cdef unicode normalize_word_string(unicode word): # <<<<<<<<<<<<<< + * '''Return a normalized version of the word, mapping: + * - 4 digit strings into !YEAR + */ + +static PyObject *__pyx_f_5spacy_2en_normalize_word_string(PyObject *__pyx_v_word) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + int __pyx_t_3; + Py_ssize_t __pyx_t_4; + int __pyx_t_5; + int __pyx_t_6; + Py_UCS4 __pyx_t_7; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("normalize_word_string", 0); + + /* "spacy/en.pyx":82 + * ''' + * cdef unicode s + * if word.isdigit() and len(word) == 4: # <<<<<<<<<<<<<< + * return '!YEAR' + * elif word[0].isdigit(): + */ + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_word, __pyx_n_s_isdigit); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 82; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_empty_tuple, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 82; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_3 = __Pyx_PyObject_IsTrue(__pyx_t_2); if (unlikely(__pyx_t_3 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 82; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + if (__pyx_t_3) { + if (unlikely(__pyx_v_word == Py_None)) { + PyErr_SetString(PyExc_TypeError, "object of type 'NoneType' has no len()"); + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 82; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + } + __pyx_t_4 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_word); if (unlikely(__pyx_t_4 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 82; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_5 = (__pyx_t_4 == 4); + __pyx_t_6 = __pyx_t_5; + } else { + __pyx_t_6 = __pyx_t_3; + } + if (__pyx_t_6) { + + /* "spacy/en.pyx":83 + * cdef unicode s + * if word.isdigit() and len(word) == 4: + * return '!YEAR' # <<<<<<<<<<<<<< + * elif word[0].isdigit(): + * return '!DIGITS' + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_kp_u_YEAR); + __pyx_r = __pyx_kp_u_YEAR; + goto __pyx_L0; + } + + /* "spacy/en.pyx":84 + * if word.isdigit() and len(word) == 4: + * return '!YEAR' + * elif word[0].isdigit(): # <<<<<<<<<<<<<< + * return '!DIGITS' + * else: + */ + __pyx_t_7 = __Pyx_GetItemInt_Unicode(__pyx_v_word, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(__pyx_t_7 == (Py_UCS4)-1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 84; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + __pyx_t_6 = Py_UNICODE_ISDIGIT(__pyx_t_7); + if ((__pyx_t_6 != 0)) { + + /* "spacy/en.pyx":85 + * return '!YEAR' + * elif word[0].isdigit(): + * return '!DIGITS' # <<<<<<<<<<<<<< + * else: + * return word.lower() + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_kp_u_DIGITS); + __pyx_r = __pyx_kp_u_DIGITS; + goto __pyx_L0; + } + /*else*/ { + + /* "spacy/en.pyx":87 + * return '!DIGITS' + * else: + * return word.lower() # <<<<<<<<<<<<<< + * + * + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_v_word, __pyx_n_s_lower); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 87; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_empty_tuple, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 87; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + if (!(likely(PyUnicode_CheckExact(__pyx_t_1))||((__pyx_t_1) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "unicode", Py_TYPE(__pyx_t_1)->tp_name), 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 87; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_r = ((PyObject*)__pyx_t_1); + __pyx_t_1 = 0; + goto __pyx_L0; + } + + /* "spacy/en.pyx":75 + * + * + * cdef unicode normalize_word_string(unicode word): # <<<<<<<<<<<<<< + * '''Return a normalized version of the word, mapping: + * - 4 digit strings into !YEAR + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_2); + __Pyx_AddTraceback("spacy.en.normalize_word_string", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "spacy/en.pyx":90 + * + * + * cpdef unicode _substr(unicode string, int start, int end, size_t length): # <<<<<<<<<<<<<< + * if end >= length: + * end = -1 + */ + +static PyObject *__pyx_pw_5spacy_2en_7_substr(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static PyObject *__pyx_f_5spacy_2en__substr(PyObject *__pyx_v_string, int __pyx_v_start, int __pyx_v_end, size_t __pyx_v_length, CYTHON_UNUSED int __pyx_skip_dispatch) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + int __pyx_t_1; + int __pyx_t_2; + int __pyx_t_3; + PyObject *__pyx_t_4 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("_substr", 0); + + /* "spacy/en.pyx":91 + * + * cpdef unicode _substr(unicode string, int start, int end, size_t length): + * if end >= length: # <<<<<<<<<<<<<< + * end = -1 + * if start >= length: + */ + __pyx_t_1 = ((__pyx_v_end >= __pyx_v_length) != 0); + if (__pyx_t_1) { + + /* "spacy/en.pyx":92 + * cpdef unicode _substr(unicode string, int start, int end, size_t length): + * if end >= length: + * end = -1 # <<<<<<<<<<<<<< + * if start >= length: + * start = 0 + */ + __pyx_v_end = -1; + goto __pyx_L3; + } + __pyx_L3:; + + /* "spacy/en.pyx":93 + * if end >= length: + * end = -1 + * if start >= length: # <<<<<<<<<<<<<< + * start = 0 + * if start <= 0 and end < 0: + */ + __pyx_t_1 = ((__pyx_v_start >= __pyx_v_length) != 0); + if (__pyx_t_1) { + + /* "spacy/en.pyx":94 + * end = -1 + * if start >= length: + * start = 0 # <<<<<<<<<<<<<< + * if start <= 0 and end < 0: + * return string + */ + __pyx_v_start = 0; + goto __pyx_L4; + } + __pyx_L4:; + + /* "spacy/en.pyx":95 + * if start >= length: + * start = 0 + * if start <= 0 and end < 0: # <<<<<<<<<<<<<< + * return string + * elif start < 0: + */ + __pyx_t_1 = ((__pyx_v_start <= 0) != 0); + if (__pyx_t_1) { + __pyx_t_2 = ((__pyx_v_end < 0) != 0); + __pyx_t_3 = __pyx_t_2; + } else { + __pyx_t_3 = __pyx_t_1; + } + if (__pyx_t_3) { + + /* "spacy/en.pyx":96 + * start = 0 + * if start <= 0 and end < 0: + * return string # <<<<<<<<<<<<<< + * elif start < 0: + * start = 0 + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_v_string); + __pyx_r = __pyx_v_string; + goto __pyx_L0; + } + + /* "spacy/en.pyx":97 + * if start <= 0 and end < 0: + * return string + * elif start < 0: # <<<<<<<<<<<<<< + * start = 0 + * elif end < 0: + */ + __pyx_t_3 = ((__pyx_v_start < 0) != 0); + if (__pyx_t_3) { + + /* "spacy/en.pyx":98 + * return string + * elif start < 0: + * start = 0 # <<<<<<<<<<<<<< + * elif end < 0: + * end = length + */ + __pyx_v_start = 0; + goto __pyx_L5; + } + + /* "spacy/en.pyx":99 + * elif start < 0: + * start = 0 + * elif end < 0: # <<<<<<<<<<<<<< + * end = length + * return string[start:end] + */ + __pyx_t_3 = ((__pyx_v_end < 0) != 0); + if (__pyx_t_3) { + + /* "spacy/en.pyx":100 + * start = 0 + * elif end < 0: + * end = length # <<<<<<<<<<<<<< + * return string[start:end] + * + */ + __pyx_v_end = __pyx_v_length; + goto __pyx_L5; + } + __pyx_L5:; + + /* "spacy/en.pyx":101 + * elif end < 0: + * end = length + * return string[start:end] # <<<<<<<<<<<<<< + * + * + */ + __Pyx_XDECREF(__pyx_r); + if (unlikely(__pyx_v_string == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 101; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + } + __pyx_t_4 = __Pyx_PyUnicode_Substring(__pyx_v_string, __pyx_v_start, __pyx_v_end); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 101; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_4); + __pyx_r = ((PyObject*)__pyx_t_4); + __pyx_t_4 = 0; + goto __pyx_L0; + + /* "spacy/en.pyx":90 + * + * + * cpdef unicode _substr(unicode string, int start, int end, size_t length): # <<<<<<<<<<<<<< + * if end >= length: + * end = -1 + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_4); + __Pyx_AddTraceback("spacy.en._substr", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* Python wrapper */ +static PyObject *__pyx_pw_5spacy_2en_7_substr(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static PyObject *__pyx_pw_5spacy_2en_7_substr(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + PyObject *__pyx_v_string = 0; + int __pyx_v_start; + int __pyx_v_end; + size_t __pyx_v_length; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("_substr (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_string,&__pyx_n_s_start,&__pyx_n_s_end,&__pyx_n_s_length,0}; + PyObject* values[4] = {0,0,0,0}; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 4: values[3] = PyTuple_GET_ITEM(__pyx_args, 3); + case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_string)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + case 1: + if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_start)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("_substr", 1, 4, 4, 1); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 90; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + } + case 2: + if (likely((values[2] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_end)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("_substr", 1, 4, 4, 2); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 90; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + } + case 3: + if (likely((values[3] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_length)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("_substr", 1, 4, 4, 3); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 90; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "_substr") < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 90; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + } + } else if (PyTuple_GET_SIZE(__pyx_args) != 4) { + goto __pyx_L5_argtuple_error; + } else { + values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + values[3] = PyTuple_GET_ITEM(__pyx_args, 3); + } + __pyx_v_string = ((PyObject*)values[0]); + __pyx_v_start = __Pyx_PyInt_As_int(values[1]); if (unlikely((__pyx_v_start == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 90; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_end = __Pyx_PyInt_As_int(values[2]); if (unlikely((__pyx_v_end == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 90; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_length = __Pyx_PyInt_As_size_t(values[3]); if (unlikely((__pyx_v_length == (size_t)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 90; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("_substr", 1, 4, 4, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 90; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_L3_error:; + __Pyx_AddTraceback("spacy.en._substr", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_string), (&PyUnicode_Type), 1, "string", 1))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 90; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_r = __pyx_pf_5spacy_2en_6_substr(__pyx_self, __pyx_v_string, __pyx_v_start, __pyx_v_end, __pyx_v_length); + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __pyx_r = NULL; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5spacy_2en_6_substr(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_string, int __pyx_v_start, int __pyx_v_end, size_t __pyx_v_length) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("_substr", 0); + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = __pyx_f_5spacy_2en__substr(__pyx_v_string, __pyx_v_start, __pyx_v_end, __pyx_v_length, 0); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 90; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("spacy.en._substr", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "spacy/en.pyx":104 + * + * + * cdef Lexeme* _add(StringHash hashed, unicode string, int split, size_t length) except NULL: # <<<<<<<<<<<<<< + * assert string + * assert split <= length + */ + +static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__add(__pyx_t_5spacy_6lexeme_StringHash __pyx_v_hashed, PyObject *__pyx_v_string, int __pyx_v_split, size_t __pyx_v_length) { + struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_v_word; + struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_r; + __Pyx_RefNannyDeclarations + int __pyx_t_1; + struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_t_2; + PyObject *__pyx_t_3 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("_add", 0); + + /* "spacy/en.pyx":105 + * + * cdef Lexeme* _add(StringHash hashed, unicode string, int split, size_t length) except NULL: + * assert string # <<<<<<<<<<<<<< + * assert split <= length + * word = _init_lexeme(string, hashed, split, length) + */ + #ifndef CYTHON_WITHOUT_ASSERTIONS + if (unlikely(!Py_OptimizeFlag)) { + __pyx_t_1 = (__pyx_v_string != Py_None) && (PyUnicode_GET_SIZE(__pyx_v_string) != 0); + if (unlikely(!__pyx_t_1)) { + PyErr_SetNone(PyExc_AssertionError); + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 105; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + } + } + #endif + + /* "spacy/en.pyx":106 + * cdef Lexeme* _add(StringHash hashed, unicode string, int split, size_t length) except NULL: + * assert string + * assert split <= length # <<<<<<<<<<<<<< + * word = _init_lexeme(string, hashed, split, length) + * LEXEMES[hashed] = word + */ + #ifndef CYTHON_WITHOUT_ASSERTIONS + if (unlikely(!Py_OptimizeFlag)) { + if (unlikely(!((__pyx_v_split <= __pyx_v_length) != 0))) { + PyErr_SetNone(PyExc_AssertionError); + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 106; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + } + } + #endif + + /* "spacy/en.pyx":107 + * assert string + * assert split <= length + * word = _init_lexeme(string, hashed, split, length) # <<<<<<<<<<<<<< + * LEXEMES[hashed] = word + * STRINGS[hashed] = string + */ + __pyx_t_2 = __pyx_f_5spacy_2en__init_lexeme(__pyx_v_string, __pyx_v_hashed, __pyx_v_split, __pyx_v_length); if (unlikely(__pyx_t_2 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 107; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_v_word = __pyx_t_2; + + /* "spacy/en.pyx":108 + * assert split <= length + * word = _init_lexeme(string, hashed, split, length) + * LEXEMES[hashed] = word # <<<<<<<<<<<<<< + * STRINGS[hashed] = string + * return word + */ + (__pyx_v_5spacy_2en_LEXEMES[__pyx_v_hashed]) = __pyx_v_word; + + /* "spacy/en.pyx":109 + * word = _init_lexeme(string, hashed, split, length) + * LEXEMES[hashed] = word + * STRINGS[hashed] = string # <<<<<<<<<<<<<< + * return word + * + */ + __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_STRINGS); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 109; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_3); + if (unlikely(__Pyx_SetItemInt(__pyx_t_3, __pyx_v_hashed, __pyx_v_string, __pyx_t_5spacy_6lexeme_StringHash, 0, __Pyx_PyInt_From_uint64_t, 0, 0, 1) < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 109; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + + /* "spacy/en.pyx":110 + * LEXEMES[hashed] = word + * STRINGS[hashed] = string + * return word # <<<<<<<<<<<<<< + * + * + */ + __pyx_r = __pyx_v_word; + goto __pyx_L0; + + /* "spacy/en.pyx":104 + * + * + * cdef Lexeme* _add(StringHash hashed, unicode string, int split, size_t length) except NULL: # <<<<<<<<<<<<<< + * assert string + * assert split <= length + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_3); + __Pyx_AddTraceback("spacy.en._add", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "spacy/en.pyx":113 + * + * + * cdef Lexeme* _init_lexeme(unicode string, StringHash hashed, # <<<<<<<<<<<<<< + * int split, size_t length) except NULL: + * assert split <= length + */ + +static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyObject *__pyx_v_string, __pyx_t_5spacy_6lexeme_StringHash __pyx_v_hashed, int __pyx_v_split, size_t __pyx_v_length) { + struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_v_word; + PyObject *__pyx_v_tail_string = 0; + PyObject *__pyx_v_lex = 0; + PyObject *__pyx_v_normed = 0; + PyObject *__pyx_v_last3 = 0; + struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_r; + __Pyx_RefNannyDeclarations + long __pyx_t_1; + int __pyx_t_2; + Py_UCS4 __pyx_t_3; + int __pyx_t_4; + int __pyx_t_5; + PyObject *__pyx_t_6 = NULL; + Py_ssize_t __pyx_t_7; + __pyx_t_5spacy_6lexeme_StringHash __pyx_t_8; + __pyx_t_5spacy_2en_Lexeme_addr __pyx_t_9; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("_init_lexeme", 0); + + /* "spacy/en.pyx":115 + * cdef Lexeme* _init_lexeme(unicode string, StringHash hashed, + * int split, size_t length) except NULL: + * assert split <= length # <<<<<<<<<<<<<< + * cdef Lexeme* word = calloc(1, sizeof(Lexeme)) + * + */ + #ifndef CYTHON_WITHOUT_ASSERTIONS + if (unlikely(!Py_OptimizeFlag)) { + if (unlikely(!((__pyx_v_split <= __pyx_v_length) != 0))) { + PyErr_SetNone(PyExc_AssertionError); + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 115; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + } + } + #endif + + /* "spacy/en.pyx":116 + * int split, size_t length) except NULL: + * assert split <= length + * cdef Lexeme* word = calloc(1, sizeof(Lexeme)) # <<<<<<<<<<<<<< + * + * word.first = (string[0] if string else 0) + */ + __pyx_v_word = ((struct __pyx_t_5spacy_6lexeme_Lexeme *)calloc(1, (sizeof(struct __pyx_t_5spacy_6lexeme_Lexeme)))); + + /* "spacy/en.pyx":118 + * cdef Lexeme* word = calloc(1, sizeof(Lexeme)) + * + * word.first = (string[0] if string else 0) # <<<<<<<<<<<<<< + * word.sic = hashed + * + */ + __pyx_t_2 = (__pyx_v_string != Py_None) && (PyUnicode_GET_SIZE(__pyx_v_string) != 0); + if (__pyx_t_2) { + __pyx_t_3 = __Pyx_GetItemInt_Unicode(__pyx_v_string, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(__pyx_t_3 == (Py_UCS4)-1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 118; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + __pyx_t_1 = __pyx_t_3; + } else { + __pyx_t_1 = 0; + } + __pyx_v_word->first = ((Py_UNICODE)__pyx_t_1); + + /* "spacy/en.pyx":119 + * + * word.first = (string[0] if string else 0) + * word.sic = hashed # <<<<<<<<<<<<<< + * + * cdef unicode tail_string + */ + __pyx_v_word->sic = __pyx_v_hashed; + + /* "spacy/en.pyx":123 + * cdef unicode tail_string + * cdef unicode lex + * if split != 0 and split < length: # <<<<<<<<<<<<<< + * lex = _substr(string, 0, split, length) + * tail_string = _substr(string, split, length, length) + */ + __pyx_t_2 = ((__pyx_v_split != 0) != 0); + if (__pyx_t_2) { + __pyx_t_4 = ((__pyx_v_split < __pyx_v_length) != 0); + __pyx_t_5 = __pyx_t_4; + } else { + __pyx_t_5 = __pyx_t_2; + } + if (__pyx_t_5) { + + /* "spacy/en.pyx":124 + * cdef unicode lex + * if split != 0 and split < length: + * lex = _substr(string, 0, split, length) # <<<<<<<<<<<<<< + * tail_string = _substr(string, split, length, length) + * else: + */ + __pyx_t_6 = __pyx_f_5spacy_2en__substr(__pyx_v_string, 0, __pyx_v_split, __pyx_v_length, 0); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 124; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_6); + __pyx_v_lex = ((PyObject*)__pyx_t_6); + __pyx_t_6 = 0; + + /* "spacy/en.pyx":125 + * if split != 0 and split < length: + * lex = _substr(string, 0, split, length) + * tail_string = _substr(string, split, length, length) # <<<<<<<<<<<<<< + * else: + * lex = string + */ + __pyx_t_6 = __pyx_f_5spacy_2en__substr(__pyx_v_string, __pyx_v_split, __pyx_v_length, __pyx_v_length, 0); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 125; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_6); + __pyx_v_tail_string = ((PyObject*)__pyx_t_6); + __pyx_t_6 = 0; + goto __pyx_L3; + } + /*else*/ { + + /* "spacy/en.pyx":127 + * tail_string = _substr(string, split, length, length) + * else: + * lex = string # <<<<<<<<<<<<<< + * tail_string = '' + * assert lex + */ + __Pyx_INCREF(__pyx_v_string); + __pyx_v_lex = __pyx_v_string; + + /* "spacy/en.pyx":128 + * else: + * lex = string + * tail_string = '' # <<<<<<<<<<<<<< + * assert lex + * cdef unicode normed = normalize_word_string(lex) + */ + __Pyx_INCREF(__pyx_kp_u_); + __pyx_v_tail_string = __pyx_kp_u_; + } + __pyx_L3:; + + /* "spacy/en.pyx":129 + * lex = string + * tail_string = '' + * assert lex # <<<<<<<<<<<<<< + * cdef unicode normed = normalize_word_string(lex) + * cdef unicode last3 = _substr(string, length - 3, length, length) + */ + #ifndef CYTHON_WITHOUT_ASSERTIONS + if (unlikely(!Py_OptimizeFlag)) { + __pyx_t_5 = (__pyx_v_lex != Py_None) && (PyUnicode_GET_SIZE(__pyx_v_lex) != 0); + if (unlikely(!__pyx_t_5)) { + PyErr_SetNone(PyExc_AssertionError); + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 129; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + } + } + #endif + + /* "spacy/en.pyx":130 + * tail_string = '' + * assert lex + * cdef unicode normed = normalize_word_string(lex) # <<<<<<<<<<<<<< + * cdef unicode last3 = _substr(string, length - 3, length, length) + * + */ + __pyx_t_6 = __pyx_f_5spacy_2en_normalize_word_string(__pyx_v_lex); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 130; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_6); + __pyx_v_normed = ((PyObject*)__pyx_t_6); + __pyx_t_6 = 0; + + /* "spacy/en.pyx":131 + * assert lex + * cdef unicode normed = normalize_word_string(lex) + * cdef unicode last3 = _substr(string, length - 3, length, length) # <<<<<<<<<<<<<< + * + * assert normed + */ + __pyx_t_6 = __pyx_f_5spacy_2en__substr(__pyx_v_string, (__pyx_v_length - 3), __pyx_v_length, __pyx_v_length, 0); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 131; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_6); + __pyx_v_last3 = ((PyObject*)__pyx_t_6); + __pyx_t_6 = 0; + + /* "spacy/en.pyx":133 + * cdef unicode last3 = _substr(string, length - 3, length, length) + * + * assert normed # <<<<<<<<<<<<<< + * assert len(normed) + * + */ + #ifndef CYTHON_WITHOUT_ASSERTIONS + if (unlikely(!Py_OptimizeFlag)) { + __pyx_t_5 = (__pyx_v_normed != Py_None) && (PyUnicode_GET_SIZE(__pyx_v_normed) != 0); + if (unlikely(!__pyx_t_5)) { + PyErr_SetNone(PyExc_AssertionError); + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 133; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + } + } + #endif + + /* "spacy/en.pyx":134 + * + * assert normed + * assert len(normed) # <<<<<<<<<<<<<< + * + * word.lex = hash_string(lex, len(lex)) + */ + #ifndef CYTHON_WITHOUT_ASSERTIONS + if (unlikely(!Py_OptimizeFlag)) { + if (unlikely(__pyx_v_normed == Py_None)) { + PyErr_SetString(PyExc_TypeError, "object of type 'NoneType' has no len()"); + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 134; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + } + __pyx_t_7 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_normed); if (unlikely(__pyx_t_7 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 134; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(!(__pyx_t_7 != 0))) { + PyErr_SetNone(PyExc_AssertionError); + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 134; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + } + } + #endif + + /* "spacy/en.pyx":136 + * assert len(normed) + * + * word.lex = hash_string(lex, len(lex)) # <<<<<<<<<<<<<< + * word.normed = hash_string(normed, len(normed)) + * word.last3 = hash_string(last3, len(last3)) + */ + if (unlikely(__pyx_v_lex == Py_None)) { + PyErr_SetString(PyExc_TypeError, "object of type 'NoneType' has no len()"); + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 136; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + } + __pyx_t_7 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_lex); if (unlikely(__pyx_t_7 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 136; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_8 = __pyx_f_5spacy_2en_hash_string(__pyx_v_lex, __pyx_t_7); if (unlikely(__pyx_t_8 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 136; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_v_word->lex = __pyx_t_8; + + /* "spacy/en.pyx":137 + * + * word.lex = hash_string(lex, len(lex)) + * word.normed = hash_string(normed, len(normed)) # <<<<<<<<<<<<<< + * word.last3 = hash_string(last3, len(last3)) + * + */ + if (unlikely(__pyx_v_normed == Py_None)) { + PyErr_SetString(PyExc_TypeError, "object of type 'NoneType' has no len()"); + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 137; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + } + __pyx_t_7 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_normed); if (unlikely(__pyx_t_7 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 137; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_8 = __pyx_f_5spacy_2en_hash_string(__pyx_v_normed, __pyx_t_7); if (unlikely(__pyx_t_8 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 137; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_v_word->normed = __pyx_t_8; + + /* "spacy/en.pyx":138 + * word.lex = hash_string(lex, len(lex)) + * word.normed = hash_string(normed, len(normed)) + * word.last3 = hash_string(last3, len(last3)) # <<<<<<<<<<<<<< + * + * STRINGS[word.lex] = lex + */ + if (unlikely(__pyx_v_last3 == Py_None)) { + PyErr_SetString(PyExc_TypeError, "object of type 'NoneType' has no len()"); + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 138; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + } + __pyx_t_7 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_last3); if (unlikely(__pyx_t_7 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 138; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_8 = __pyx_f_5spacy_2en_hash_string(__pyx_v_last3, __pyx_t_7); if (unlikely(__pyx_t_8 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 138; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_v_word->last3 = __pyx_t_8; + + /* "spacy/en.pyx":140 + * word.last3 = hash_string(last3, len(last3)) + * + * STRINGS[word.lex] = lex # <<<<<<<<<<<<<< + * STRINGS[word.normed] = normed + * STRINGS[word.last3] = last3 + */ + __pyx_t_6 = __Pyx_GetModuleGlobalName(__pyx_n_s_STRINGS); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 140; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_6); + if (unlikely(__Pyx_SetItemInt(__pyx_t_6, __pyx_v_word->lex, __pyx_v_lex, __pyx_t_5spacy_6lexeme_StringHash, 0, __Pyx_PyInt_From_uint64_t, 0, 0, 1) < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 140; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + + /* "spacy/en.pyx":141 + * + * STRINGS[word.lex] = lex + * STRINGS[word.normed] = normed # <<<<<<<<<<<<<< + * STRINGS[word.last3] = last3 + * + */ + __pyx_t_6 = __Pyx_GetModuleGlobalName(__pyx_n_s_STRINGS); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 141; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_6); + if (unlikely(__Pyx_SetItemInt(__pyx_t_6, __pyx_v_word->normed, __pyx_v_normed, __pyx_t_5spacy_6lexeme_StringHash, 0, __Pyx_PyInt_From_uint64_t, 0, 0, 1) < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 141; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + + /* "spacy/en.pyx":142 + * STRINGS[word.lex] = lex + * STRINGS[word.normed] = normed + * STRINGS[word.last3] = last3 # <<<<<<<<<<<<<< + * + * # These are loaded later + */ + __pyx_t_6 = __Pyx_GetModuleGlobalName(__pyx_n_s_STRINGS); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 142; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_6); + if (unlikely(__Pyx_SetItemInt(__pyx_t_6, __pyx_v_word->last3, __pyx_v_last3, __pyx_t_5spacy_6lexeme_StringHash, 0, __Pyx_PyInt_From_uint64_t, 0, 0, 1) < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 142; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + + /* "spacy/en.pyx":145 + * + * # These are loaded later + * word.prob = 0 # <<<<<<<<<<<<<< + * word.cluster = 0 + * word.oft_upper = False + */ + __pyx_v_word->prob = 0.0; + + /* "spacy/en.pyx":146 + * # These are loaded later + * word.prob = 0 + * word.cluster = 0 # <<<<<<<<<<<<<< + * word.oft_upper = False + * word.oft_title = False + */ + __pyx_v_word->cluster = 0; + + /* "spacy/en.pyx":147 + * word.prob = 0 + * word.cluster = 0 + * word.oft_upper = False # <<<<<<<<<<<<<< + * word.oft_title = False + * + */ + __pyx_v_word->oft_upper = 0; + + /* "spacy/en.pyx":148 + * word.cluster = 0 + * word.oft_upper = False + * word.oft_title = False # <<<<<<<<<<<<<< + * + * # Now recurse, and deal with the tail + */ + __pyx_v_word->oft_title = 0; + + /* "spacy/en.pyx":151 + * + * # Now recurse, and deal with the tail + * if tail_string: # <<<<<<<<<<<<<< + * word.tail = lookup(tail_string) + * return word + */ + __pyx_t_5 = (__pyx_v_tail_string != Py_None) && (PyUnicode_GET_SIZE(__pyx_v_tail_string) != 0); + if (__pyx_t_5) { + + /* "spacy/en.pyx":152 + * # Now recurse, and deal with the tail + * if tail_string: + * word.tail = lookup(tail_string) # <<<<<<<<<<<<<< + * return word + * + */ + __pyx_t_9 = __pyx_f_5spacy_2en_lookup(__pyx_v_tail_string, 0); if (unlikely(__pyx_t_9 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 152; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_v_word->tail = ((struct __pyx_t_5spacy_6lexeme_Lexeme *)__pyx_t_9); + goto __pyx_L4; + } + __pyx_L4:; + + /* "spacy/en.pyx":153 + * if tail_string: + * word.tail = lookup(tail_string) + * return word # <<<<<<<<<<<<<< + * + * + */ + __pyx_r = __pyx_v_word; + goto __pyx_L0; + + /* "spacy/en.pyx":113 + * + * + * cdef Lexeme* _init_lexeme(unicode string, StringHash hashed, # <<<<<<<<<<<<<< + * int split, size_t length) except NULL: + * assert split <= length + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_6); + __Pyx_AddTraceback("spacy.en._init_lexeme", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XDECREF(__pyx_v_tail_string); + __Pyx_XDECREF(__pyx_v_lex); + __Pyx_XDECREF(__pyx_v_normed); + __Pyx_XDECREF(__pyx_v_last3); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "spacy/en.pyx":156 + * + * + * cdef size_t _find_split(unicode word, size_t length): # <<<<<<<<<<<<<< + * cdef size_t i = 0 + * if word[0].isalnum(): + */ + +static size_t __pyx_f_5spacy_2en__find_split(PyObject *__pyx_v_word, size_t __pyx_v_length) { + size_t __pyx_v_i; + size_t __pyx_r; + __Pyx_RefNannyDeclarations + Py_UCS4 __pyx_t_1; + int __pyx_t_2; + int __pyx_t_3; + int __pyx_t_4; + size_t __pyx_t_5; + Py_UCS4 __pyx_t_6; + int __pyx_t_7; + int __pyx_t_8; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("_find_split", 0); + + /* "spacy/en.pyx":157 + * + * cdef size_t _find_split(unicode word, size_t length): + * cdef size_t i = 0 # <<<<<<<<<<<<<< + * if word[0].isalnum(): + * while i < length and word[i].isalnum(): + */ + __pyx_v_i = 0; + + /* "spacy/en.pyx":158 + * cdef size_t _find_split(unicode word, size_t length): + * cdef size_t i = 0 + * if word[0].isalnum(): # <<<<<<<<<<<<<< + * while i < length and word[i].isalnum(): + * i += 1 + */ + __pyx_t_1 = __Pyx_GetItemInt_Unicode(__pyx_v_word, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(__pyx_t_1 == (Py_UCS4)-1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 158; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + __pyx_t_2 = Py_UNICODE_ISALNUM(__pyx_t_1); + if ((__pyx_t_2 != 0)) { + + /* "spacy/en.pyx":159 + * cdef size_t i = 0 + * if word[0].isalnum(): + * while i < length and word[i].isalnum(): # <<<<<<<<<<<<<< + * i += 1 + * else: + */ + while (1) { + __pyx_t_2 = (__pyx_v_i < __pyx_v_length); + if (__pyx_t_2) { + __pyx_t_1 = __Pyx_GetItemInt_Unicode(__pyx_v_word, __pyx_v_i, size_t, 0, __Pyx_PyInt_FromSize_t, 0, 0, 1); if (unlikely(__pyx_t_1 == (Py_UCS4)-1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 159; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + __pyx_t_3 = Py_UNICODE_ISALNUM(__pyx_t_1); + __pyx_t_4 = (__pyx_t_3 != 0); + } else { + __pyx_t_4 = __pyx_t_2; + } + if (!__pyx_t_4) break; + + /* "spacy/en.pyx":160 + * if word[0].isalnum(): + * while i < length and word[i].isalnum(): + * i += 1 # <<<<<<<<<<<<<< + * else: + * # Split off a punctuation character, or a sequence of the same punctuation character + */ + __pyx_v_i = (__pyx_v_i + 1); + } + goto __pyx_L3; + } + /*else*/ { + + /* "spacy/en.pyx":163 + * else: + * # Split off a punctuation character, or a sequence of the same punctuation character + * while i < length and not word[i].isalnum() and (i == 0 or word[i-1] == word[i]): # <<<<<<<<<<<<<< + * i += 1 + * return i + */ + while (1) { + __pyx_t_4 = ((__pyx_v_i < __pyx_v_length) != 0); + if (__pyx_t_4) { + __pyx_t_1 = __Pyx_GetItemInt_Unicode(__pyx_v_word, __pyx_v_i, size_t, 0, __Pyx_PyInt_FromSize_t, 0, 0, 1); if (unlikely(__pyx_t_1 == (Py_UCS4)-1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 163; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + __pyx_t_2 = Py_UNICODE_ISALNUM(__pyx_t_1); + __pyx_t_3 = ((!(__pyx_t_2 != 0)) != 0); + if (__pyx_t_3) { + __pyx_t_2 = ((__pyx_v_i == 0) != 0); + if (!__pyx_t_2) { + __pyx_t_5 = (__pyx_v_i - 1); + __pyx_t_1 = __Pyx_GetItemInt_Unicode(__pyx_v_word, __pyx_t_5, size_t, 0, __Pyx_PyInt_FromSize_t, 0, 0, 1); if (unlikely(__pyx_t_1 == (Py_UCS4)-1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 163; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + __pyx_t_6 = __Pyx_GetItemInt_Unicode(__pyx_v_word, __pyx_v_i, size_t, 0, __Pyx_PyInt_FromSize_t, 0, 0, 1); if (unlikely(__pyx_t_6 == (Py_UCS4)-1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 163; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + __pyx_t_7 = ((__pyx_t_1 == __pyx_t_6) != 0); + __pyx_t_8 = __pyx_t_7; + } else { + __pyx_t_8 = __pyx_t_2; + } + __pyx_t_2 = __pyx_t_8; + } else { + __pyx_t_2 = __pyx_t_3; + } + __pyx_t_3 = __pyx_t_2; + } else { + __pyx_t_3 = __pyx_t_4; + } + if (!__pyx_t_3) break; + + /* "spacy/en.pyx":164 + * # Split off a punctuation character, or a sequence of the same punctuation character + * while i < length and not word[i].isalnum() and (i == 0 or word[i-1] == word[i]): + * i += 1 # <<<<<<<<<<<<<< + * return i + */ + __pyx_v_i = (__pyx_v_i + 1); + } + } + __pyx_L3:; + + /* "spacy/en.pyx":165 + * while i < length and not word[i].isalnum() and (i == 0 or word[i-1] == word[i]): + * i += 1 + * return i # <<<<<<<<<<<<<< + */ + __pyx_r = __pyx_v_i; + goto __pyx_L0; + + /* "spacy/en.pyx":156 + * + * + * cdef size_t _find_split(unicode word, size_t length): # <<<<<<<<<<<<<< + * cdef size_t i = 0 + * if word[0].isalnum(): + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_WriteUnraisable("spacy.en._find_split", __pyx_clineno, __pyx_lineno, __pyx_filename, 0); + __pyx_r = 0; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyMethodDef __pyx_methods[] = { + {__Pyx_NAMESTR("lookup"), (PyCFunction)__pyx_pw_5spacy_2en_1lookup, METH_O, __Pyx_DOCSTR(__pyx_doc_5spacy_2en_lookup)}, + {__Pyx_NAMESTR("lookup_chunk"), (PyCFunction)__pyx_pw_5spacy_2en_3lookup_chunk, METH_VARARGS|METH_KEYWORDS, __Pyx_DOCSTR(__pyx_doc_5spacy_2en_2lookup_chunk)}, + {__Pyx_NAMESTR("unhash"), (PyCFunction)__pyx_pw_5spacy_2en_5unhash, METH_O, __Pyx_DOCSTR(__pyx_doc_5spacy_2en_4unhash)}, + {__Pyx_NAMESTR("_substr"), (PyCFunction)__pyx_pw_5spacy_2en_7_substr, METH_VARARGS|METH_KEYWORDS, __Pyx_DOCSTR(0)}, + {0, 0, 0, 0} +}; + +#if PY_MAJOR_VERSION >= 3 +static struct PyModuleDef __pyx_moduledef = { + #if PY_VERSION_HEX < 0x03020000 + { PyObject_HEAD_INIT(NULL) NULL, 0, NULL }, + #else + PyModuleDef_HEAD_INIT, + #endif + __Pyx_NAMESTR("en"), + __Pyx_DOCSTR(__pyx_k_Serve_pointers_to_Lexeme_structs), /* m_doc */ + -1, /* m_size */ + __pyx_methods /* m_methods */, + NULL, /* m_reload */ + NULL, /* m_traverse */ + NULL, /* m_clear */ + NULL /* m_free */ +}; +#endif + +static __Pyx_StringTabEntry __pyx_string_tab[] = { + {&__pyx_kp_u_, __pyx_k_, sizeof(__pyx_k_), 0, 1, 0, 0}, + {&__pyx_kp_u_DIGITS, __pyx_k_DIGITS, sizeof(__pyx_k_DIGITS), 0, 1, 0, 0}, + {&__pyx_n_s_LEXEMES, __pyx_k_LEXEMES, sizeof(__pyx_k_LEXEMES), 0, 0, 1, 1}, + {&__pyx_n_s_STRINGS, __pyx_k_STRINGS, sizeof(__pyx_k_STRINGS), 0, 0, 1, 1}, + {&__pyx_n_s_ValueError, __pyx_k_ValueError, sizeof(__pyx_k_ValueError), 0, 0, 1, 1}, + {&__pyx_kp_u_YEAR, __pyx_k_YEAR, sizeof(__pyx_k_YEAR), 0, 1, 0, 0}, + {&__pyx_n_s_cluster, __pyx_k_cluster, sizeof(__pyx_k_cluster), 0, 0, 1, 1}, + {&__pyx_n_s_end, __pyx_k_end, sizeof(__pyx_k_end), 0, 0, 1, 1}, + {&__pyx_n_s_first, __pyx_k_first, sizeof(__pyx_k_first), 0, 0, 1, 1}, + {&__pyx_n_s_isdigit, __pyx_k_isdigit, sizeof(__pyx_k_isdigit), 0, 0, 1, 1}, + {&__pyx_n_s_last3, __pyx_k_last3, sizeof(__pyx_k_last3), 0, 0, 1, 1}, + {&__pyx_n_s_length, __pyx_k_length, sizeof(__pyx_k_length), 0, 0, 1, 1}, + {&__pyx_n_s_lex, __pyx_k_lex, sizeof(__pyx_k_lex), 0, 0, 1, 1}, + {&__pyx_n_s_lower, __pyx_k_lower, sizeof(__pyx_k_lower), 0, 0, 1, 1}, + {&__pyx_n_s_main, __pyx_k_main, sizeof(__pyx_k_main), 0, 0, 1, 1}, + {&__pyx_n_s_normed, __pyx_k_normed, sizeof(__pyx_k_normed), 0, 0, 1, 1}, + {&__pyx_n_s_oft_title, __pyx_k_oft_title, sizeof(__pyx_k_oft_title), 0, 0, 1, 1}, + {&__pyx_n_s_oft_upper, __pyx_k_oft_upper, sizeof(__pyx_k_oft_upper), 0, 0, 1, 1}, + {&__pyx_n_s_prob, __pyx_k_prob, sizeof(__pyx_k_prob), 0, 0, 1, 1}, + {&__pyx_n_s_pyx_capi, __pyx_k_pyx_capi, sizeof(__pyx_k_pyx_capi), 0, 0, 1, 1}, + {&__pyx_n_s_sic, __pyx_k_sic, sizeof(__pyx_k_sic), 0, 0, 1, 1}, + {&__pyx_n_s_start, __pyx_k_start, sizeof(__pyx_k_start), 0, 0, 1, 1}, + {&__pyx_n_s_string, __pyx_k_string, sizeof(__pyx_k_string), 0, 0, 1, 1}, + {&__pyx_n_s_tail, __pyx_k_tail, sizeof(__pyx_k_tail), 0, 0, 1, 1}, + {&__pyx_n_s_test, __pyx_k_test, sizeof(__pyx_k_test), 0, 0, 1, 1}, + {0, 0, 0, 0, 0, 0, 0} +}; +static int __Pyx_InitCachedBuiltins(void) { + __pyx_builtin_ValueError = __Pyx_GetBuiltinName(__pyx_n_s_ValueError); if (!__pyx_builtin_ValueError) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 70; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + return 0; + __pyx_L1_error:; + return -1; +} + +static int __Pyx_InitCachedConstants(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_InitCachedConstants", 0); + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_InitGlobals(void) { + if (__Pyx_InitStrings(__pyx_string_tab) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + return 0; + __pyx_L1_error:; + return -1; +} + +#if PY_MAJOR_VERSION < 3 +PyMODINIT_FUNC initen(void); /*proto*/ +PyMODINIT_FUNC initen(void) +#else +PyMODINIT_FUNC PyInit_en(void); /*proto*/ +PyMODINIT_FUNC PyInit_en(void) +#endif +{ + PyObject *__pyx_t_1 = NULL; + struct __pyx_t_5spacy_6lexeme_Lexeme __pyx_t_2; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannyDeclarations + #if CYTHON_REFNANNY + __Pyx_RefNanny = __Pyx_RefNannyImportAPI("refnanny"); + if (!__Pyx_RefNanny) { + PyErr_Clear(); + __Pyx_RefNanny = __Pyx_RefNannyImportAPI("Cython.Runtime.refnanny"); + if (!__Pyx_RefNanny) + Py_FatalError("failed to import 'refnanny' module"); + } + #endif + __Pyx_RefNannySetupContext("PyMODINIT_FUNC PyInit_en(void)", 0); + if ( __Pyx_check_binary_version() < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_empty_tuple = PyTuple_New(0); if (unlikely(!__pyx_empty_tuple)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_empty_bytes = PyBytes_FromStringAndSize("", 0); if (unlikely(!__pyx_empty_bytes)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + #ifdef __Pyx_CyFunction_USED + if (__Pyx_CyFunction_init() < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + #endif + #ifdef __Pyx_FusedFunction_USED + if (__pyx_FusedFunction_init() < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + #endif + #ifdef __Pyx_Generator_USED + if (__pyx_Generator_init() < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + #endif + /*--- Library function declarations ---*/ + /*--- Threads initialization code ---*/ + #if defined(__PYX_FORCE_INIT_THREADS) && __PYX_FORCE_INIT_THREADS + #ifdef WITH_THREAD /* Python build with threading support? */ + PyEval_InitThreads(); + #endif + #endif + /*--- Module creation code ---*/ + #if PY_MAJOR_VERSION < 3 + __pyx_m = Py_InitModule4(__Pyx_NAMESTR("en"), __pyx_methods, __Pyx_DOCSTR(__pyx_k_Serve_pointers_to_Lexeme_structs), 0, PYTHON_API_VERSION); Py_XINCREF(__pyx_m); + #else + __pyx_m = PyModule_Create(&__pyx_moduledef); + #endif + if (unlikely(!__pyx_m)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_d = PyModule_GetDict(__pyx_m); if (unlikely(!__pyx_d)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + Py_INCREF(__pyx_d); + __pyx_b = PyImport_AddModule(__Pyx_NAMESTR(__Pyx_BUILTIN_MODULE_NAME)); if (unlikely(!__pyx_b)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + #if CYTHON_COMPILING_IN_PYPY + Py_INCREF(__pyx_b); + #endif + if (__Pyx_SetAttrString(__pyx_m, "__builtins__", __pyx_b) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + /*--- Initialize various global constants etc. ---*/ + if (unlikely(__Pyx_InitGlobals() < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + #if PY_MAJOR_VERSION < 3 && (__PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT) + if (__Pyx_init_sys_getdefaultencoding_params() < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + #endif + if (__pyx_module_is_main_spacy__en) { + if (__Pyx_SetAttrString(__pyx_m, "__name__", __pyx_n_s_main) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + } + #if PY_MAJOR_VERSION >= 3 + { + PyObject *modules = PyImport_GetModuleDict(); if (unlikely(!modules)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (!PyDict_GetItemString(modules, "spacy.en")) { + if (unlikely(PyDict_SetItemString(modules, "spacy.en", __pyx_m) < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + } + } + #endif + /*--- Builtin init code ---*/ + if (unlikely(__Pyx_InitCachedBuiltins() < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + /*--- Constants init code ---*/ + if (unlikely(__Pyx_InitCachedConstants() < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + /*--- Global init code ---*/ + /*--- Variable export code ---*/ + if (__Pyx_ExportVoidPtr(__pyx_n_s_LEXEMES, (void *)&__pyx_v_5spacy_2en_LEXEMES, "google::dense_hash_map<__pyx_t_5spacy_6lexeme_StringHash,__pyx_t_5spacy_2en_Lexeme_ptr>") < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + /*--- Function export code ---*/ + if (__Pyx_ExportFunction("lookup", (void (*)(void))__pyx_f_5spacy_2en_lookup, "__pyx_t_5spacy_2en_Lexeme_addr (PyObject *, int __pyx_skip_dispatch)") < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (__Pyx_ExportFunction("lookup_chunk", (void (*)(void))__pyx_f_5spacy_2en_lookup_chunk, "__pyx_t_5spacy_2en_Lexeme_addr (PyObject *, int, int, int __pyx_skip_dispatch)") < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (__Pyx_ExportFunction("hash_string", (void (*)(void))__pyx_f_5spacy_2en_hash_string, "__pyx_t_5spacy_6lexeme_StringHash (PyObject *, size_t)") < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (__Pyx_ExportFunction("unhash", (void (*)(void))__pyx_f_5spacy_2en_unhash, "PyObject *(__pyx_t_5spacy_6lexeme_StringHash, int __pyx_skip_dispatch)") < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + /*--- Type init code ---*/ + /*--- Type import code ---*/ + /*--- Variable import code ---*/ + /*--- Function import code ---*/ + /*--- Execution code ---*/ + + /* "spacy/en.pyx":15 + * + * + * STRINGS = {} # <<<<<<<<<<<<<< + * LEXEMES = dense_hash_map[StringHash, Lexeme_ptr]() + * LEXEMES.set_empty_key(0) + */ + __pyx_t_1 = PyDict_New(); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 15; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_STRINGS, __pyx_t_1) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 15; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "spacy/en.pyx":16 + * + * STRINGS = {} + * LEXEMES = dense_hash_map[StringHash, Lexeme_ptr]() # <<<<<<<<<<<<<< + * LEXEMES.set_empty_key(0) + * + */ + __pyx_v_5spacy_2en_LEXEMES = google::dense_hash_map<__pyx_t_5spacy_6lexeme_StringHash,__pyx_t_5spacy_2en_Lexeme_ptr>(); + + /* "spacy/en.pyx":17 + * STRINGS = {} + * LEXEMES = dense_hash_map[StringHash, Lexeme_ptr]() + * LEXEMES.set_empty_key(0) # <<<<<<<<<<<<<< + * + * + */ + __pyx_v_5spacy_2en_LEXEMES.set_empty_key(0); + + /* "spacy/en.pyx":20 + * + * + * cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL) # <<<<<<<<<<<<<< + * + * + */ + __pyx_t_2.sic = 0; + __pyx_t_2.lex = 0; + __pyx_t_2.normed = 0; + __pyx_t_2.last3 = 0; + __pyx_t_2.first = 0; + __pyx_t_2.prob = 0.0; + __pyx_t_2.cluster = 0; + __pyx_t_2.oft_upper = 0; + __pyx_t_2.oft_title = 0; + __pyx_t_2.tail = NULL; + __pyx_v_5spacy_2en_BLANK_WORD = __pyx_t_2; + + /* "spacy/en.pyx":1 + * '''Serve pointers to Lexeme structs, given strings. Maintain a reverse index, # <<<<<<<<<<<<<< + * so that strings can be retrieved from hashes. Use 64-bit hash values and + * boldly assume no collisions. + */ + __pyx_t_1 = PyDict_New(); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_1) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + if (__pyx_m) { + __Pyx_AddTraceback("init spacy.en", __pyx_clineno, __pyx_lineno, __pyx_filename); + Py_DECREF(__pyx_m); __pyx_m = 0; + } else if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_ImportError, "init spacy.en"); + } + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + #if PY_MAJOR_VERSION < 3 + return; + #else + return __pyx_m; + #endif +} + +/* Runtime support code */ +#if CYTHON_REFNANNY +static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname) { + PyObject *m = NULL, *p = NULL; + void *r = NULL; + m = PyImport_ImportModule((char *)modname); + if (!m) goto end; + p = PyObject_GetAttrString(m, (char *)"RefNannyAPI"); + if (!p) goto end; + r = PyLong_AsVoidPtr(p); +end: + Py_XDECREF(p); + Py_XDECREF(m); + return (__Pyx_RefNannyAPIStruct *)r; +} +#endif /* CYTHON_REFNANNY */ + +static PyObject *__Pyx_GetBuiltinName(PyObject *name) { + PyObject* result = __Pyx_PyObject_GetAttrStr(__pyx_b, name); + if (unlikely(!result)) { + PyErr_Format(PyExc_NameError, +#if PY_MAJOR_VERSION >= 3 + "name '%U' is not defined", name); +#else + "name '%.200s' is not defined", PyString_AS_STRING(name)); +#endif + } + return result; +} + +static CYTHON_INLINE int __Pyx_PyBytes_Equals(PyObject* s1, PyObject* s2, int equals) { +#if CYTHON_COMPILING_IN_PYPY + return PyObject_RichCompareBool(s1, s2, equals); +#else + if (s1 == s2) { + return (equals == Py_EQ); + } else if (PyBytes_CheckExact(s1) & PyBytes_CheckExact(s2)) { + const char *ps1, *ps2; + Py_ssize_t length = PyBytes_GET_SIZE(s1); + if (length != PyBytes_GET_SIZE(s2)) + return (equals == Py_NE); + ps1 = PyBytes_AS_STRING(s1); + ps2 = PyBytes_AS_STRING(s2); + if (ps1[0] != ps2[0]) { + return (equals == Py_NE); + } else if (length == 1) { + return (equals == Py_EQ); + } else { + int result = memcmp(ps1, ps2, (size_t)length); + return (equals == Py_EQ) ? (result == 0) : (result != 0); + } + } else if ((s1 == Py_None) & PyBytes_CheckExact(s2)) { + return (equals == Py_NE); + } else if ((s2 == Py_None) & PyBytes_CheckExact(s1)) { + return (equals == Py_NE); + } else { + int result; + PyObject* py_result = PyObject_RichCompare(s1, s2, equals); + if (!py_result) + return -1; + result = __Pyx_PyObject_IsTrue(py_result); + Py_DECREF(py_result); + return result; + } +#endif +} + +static CYTHON_INLINE int __Pyx_PyUnicode_Equals(PyObject* s1, PyObject* s2, int equals) { +#if CYTHON_COMPILING_IN_PYPY + return PyObject_RichCompareBool(s1, s2, equals); +#else +#if PY_MAJOR_VERSION < 3 + PyObject* owned_ref = NULL; +#endif + int s1_is_unicode, s2_is_unicode; + if (s1 == s2) { + goto return_eq; + } + s1_is_unicode = PyUnicode_CheckExact(s1); + s2_is_unicode = PyUnicode_CheckExact(s2); +#if PY_MAJOR_VERSION < 3 + if ((s1_is_unicode & (!s2_is_unicode)) && PyString_CheckExact(s2)) { + owned_ref = PyUnicode_FromObject(s2); + if (unlikely(!owned_ref)) + return -1; + s2 = owned_ref; + s2_is_unicode = 1; + } else if ((s2_is_unicode & (!s1_is_unicode)) && PyString_CheckExact(s1)) { + owned_ref = PyUnicode_FromObject(s1); + if (unlikely(!owned_ref)) + return -1; + s1 = owned_ref; + s1_is_unicode = 1; + } else if (((!s2_is_unicode) & (!s1_is_unicode))) { + return __Pyx_PyBytes_Equals(s1, s2, equals); + } +#endif + if (s1_is_unicode & s2_is_unicode) { + Py_ssize_t length; + int kind; + void *data1, *data2; + #if CYTHON_PEP393_ENABLED + if (unlikely(PyUnicode_READY(s1) < 0) || unlikely(PyUnicode_READY(s2) < 0)) + return -1; + #endif + length = __Pyx_PyUnicode_GET_LENGTH(s1); + if (length != __Pyx_PyUnicode_GET_LENGTH(s2)) { + goto return_ne; + } + kind = __Pyx_PyUnicode_KIND(s1); + if (kind != __Pyx_PyUnicode_KIND(s2)) { + goto return_ne; + } + data1 = __Pyx_PyUnicode_DATA(s1); + data2 = __Pyx_PyUnicode_DATA(s2); + if (__Pyx_PyUnicode_READ(kind, data1, 0) != __Pyx_PyUnicode_READ(kind, data2, 0)) { + goto return_ne; + } else if (length == 1) { + goto return_eq; + } else { + int result = memcmp(data1, data2, length * kind); + #if PY_MAJOR_VERSION < 3 + Py_XDECREF(owned_ref); + #endif + return (equals == Py_EQ) ? (result == 0) : (result != 0); + } + } else if ((s1 == Py_None) & s2_is_unicode) { + goto return_ne; + } else if ((s2 == Py_None) & s1_is_unicode) { + goto return_ne; + } else { + int result; + PyObject* py_result = PyObject_RichCompare(s1, s2, equals); + if (!py_result) + return -1; + result = __Pyx_PyObject_IsTrue(py_result); + Py_DECREF(py_result); + return result; + } +return_eq: + #if PY_MAJOR_VERSION < 3 + Py_XDECREF(owned_ref); + #endif + return (equals == Py_EQ); +return_ne: + #if PY_MAJOR_VERSION < 3 + Py_XDECREF(owned_ref); + #endif + return (equals == Py_NE); +#endif +} + +static void __Pyx_RaiseArgumentTypeInvalid(const char* name, PyObject *obj, PyTypeObject *type) { + PyErr_Format(PyExc_TypeError, + "Argument '%.200s' has incorrect type (expected %.200s, got %.200s)", + name, type->tp_name, Py_TYPE(obj)->tp_name); +} +static CYTHON_INLINE int __Pyx_ArgTypeTest(PyObject *obj, PyTypeObject *type, int none_allowed, + const char *name, int exact) +{ + if (unlikely(!type)) { + PyErr_SetString(PyExc_SystemError, "Missing type object"); + return 0; + } + if (none_allowed && obj == Py_None) return 1; + else if (exact) { + if (likely(Py_TYPE(obj) == type)) return 1; + #if PY_MAJOR_VERSION == 2 + else if ((type == &PyBaseString_Type) && likely(__Pyx_PyBaseString_CheckExact(obj))) return 1; + #endif + } + else { + if (likely(PyObject_TypeCheck(obj, type))) return 1; + } + __Pyx_RaiseArgumentTypeInvalid(name, obj, type); + return 0; +} + +static void __Pyx_RaiseArgtupleInvalid( + const char* func_name, + int exact, + Py_ssize_t num_min, + Py_ssize_t num_max, + Py_ssize_t num_found) +{ + Py_ssize_t num_expected; + const char *more_or_less; + if (num_found < num_min) { + num_expected = num_min; + more_or_less = "at least"; + } else { + num_expected = num_max; + more_or_less = "at most"; + } + if (exact) { + more_or_less = "exactly"; + } + PyErr_Format(PyExc_TypeError, + "%.200s() takes %.8s %" CYTHON_FORMAT_SSIZE_T "d positional argument%.1s (%" CYTHON_FORMAT_SSIZE_T "d given)", + func_name, more_or_less, num_expected, + (num_expected == 1) ? "" : "s", num_found); +} + +static void __Pyx_RaiseDoubleKeywordsError( + const char* func_name, + PyObject* kw_name) +{ + PyErr_Format(PyExc_TypeError, + #if PY_MAJOR_VERSION >= 3 + "%s() got multiple values for keyword argument '%U'", func_name, kw_name); + #else + "%s() got multiple values for keyword argument '%s'", func_name, + PyString_AsString(kw_name)); + #endif +} + +static int __Pyx_ParseOptionalKeywords( + PyObject *kwds, + PyObject **argnames[], + PyObject *kwds2, + PyObject *values[], + Py_ssize_t num_pos_args, + const char* function_name) +{ + PyObject *key = 0, *value = 0; + Py_ssize_t pos = 0; + PyObject*** name; + PyObject*** first_kw_arg = argnames + num_pos_args; + while (PyDict_Next(kwds, &pos, &key, &value)) { + name = first_kw_arg; + while (*name && (**name != key)) name++; + if (*name) { + values[name-argnames] = value; + continue; + } + name = first_kw_arg; + #if PY_MAJOR_VERSION < 3 + if (likely(PyString_CheckExact(key)) || likely(PyString_Check(key))) { + while (*name) { + if ((CYTHON_COMPILING_IN_PYPY || PyString_GET_SIZE(**name) == PyString_GET_SIZE(key)) + && _PyString_Eq(**name, key)) { + values[name-argnames] = value; + break; + } + name++; + } + if (*name) continue; + else { + PyObject*** argname = argnames; + while (argname != first_kw_arg) { + if ((**argname == key) || ( + (CYTHON_COMPILING_IN_PYPY || PyString_GET_SIZE(**argname) == PyString_GET_SIZE(key)) + && _PyString_Eq(**argname, key))) { + goto arg_passed_twice; + } + argname++; + } + } + } else + #endif + if (likely(PyUnicode_Check(key))) { + while (*name) { + int cmp = (**name == key) ? 0 : + #if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION >= 3 + (PyUnicode_GET_SIZE(**name) != PyUnicode_GET_SIZE(key)) ? 1 : + #endif + PyUnicode_Compare(**name, key); + if (cmp < 0 && unlikely(PyErr_Occurred())) goto bad; + if (cmp == 0) { + values[name-argnames] = value; + break; + } + name++; + } + if (*name) continue; + else { + PyObject*** argname = argnames; + while (argname != first_kw_arg) { + int cmp = (**argname == key) ? 0 : + #if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION >= 3 + (PyUnicode_GET_SIZE(**argname) != PyUnicode_GET_SIZE(key)) ? 1 : + #endif + PyUnicode_Compare(**argname, key); + if (cmp < 0 && unlikely(PyErr_Occurred())) goto bad; + if (cmp == 0) goto arg_passed_twice; + argname++; + } + } + } else + goto invalid_keyword_type; + if (kwds2) { + if (unlikely(PyDict_SetItem(kwds2, key, value))) goto bad; + } else { + goto invalid_keyword; + } + } + return 0; +arg_passed_twice: + __Pyx_RaiseDoubleKeywordsError(function_name, key); + goto bad; +invalid_keyword_type: + PyErr_Format(PyExc_TypeError, + "%.200s() keywords must be strings", function_name); + goto bad; +invalid_keyword: + PyErr_Format(PyExc_TypeError, + #if PY_MAJOR_VERSION < 3 + "%.200s() got an unexpected keyword argument '%.200s'", + function_name, PyString_AsString(key)); + #else + "%s() got an unexpected keyword argument '%U'", + function_name, key); + #endif +bad: + return -1; +} + +static CYTHON_INLINE PyObject *__Pyx_GetModuleGlobalName(PyObject *name) { + PyObject *result; +#if CYTHON_COMPILING_IN_CPYTHON + result = PyDict_GetItem(__pyx_d, name); + if (result) { + Py_INCREF(result); + } else { +#else + result = PyObject_GetItem(__pyx_d, name); + if (!result) { + PyErr_Clear(); +#endif + result = __Pyx_GetBuiltinName(name); + } + return result; +} + +static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Generic(PyObject *o, PyObject* j) { + PyObject *r; + if (!j) return NULL; + r = PyObject_GetItem(o, j); + Py_DECREF(j); + return r; +} +static CYTHON_INLINE PyObject *__Pyx_GetItemInt_List_Fast(PyObject *o, Py_ssize_t i, + int wraparound, int boundscheck) { +#if CYTHON_COMPILING_IN_CPYTHON + if (wraparound & unlikely(i < 0)) i += PyList_GET_SIZE(o); + if ((!boundscheck) || likely((0 <= i) & (i < PyList_GET_SIZE(o)))) { + PyObject *r = PyList_GET_ITEM(o, i); + Py_INCREF(r); + return r; + } + return __Pyx_GetItemInt_Generic(o, PyInt_FromSsize_t(i)); +#else + return PySequence_GetItem(o, i); +#endif +} +static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Tuple_Fast(PyObject *o, Py_ssize_t i, + int wraparound, int boundscheck) { +#if CYTHON_COMPILING_IN_CPYTHON + if (wraparound & unlikely(i < 0)) i += PyTuple_GET_SIZE(o); + if ((!boundscheck) || likely((0 <= i) & (i < PyTuple_GET_SIZE(o)))) { + PyObject *r = PyTuple_GET_ITEM(o, i); + Py_INCREF(r); + return r; + } + return __Pyx_GetItemInt_Generic(o, PyInt_FromSsize_t(i)); +#else + return PySequence_GetItem(o, i); +#endif +} +static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Fast(PyObject *o, Py_ssize_t i, + int is_list, int wraparound, int boundscheck) { +#if CYTHON_COMPILING_IN_CPYTHON + if (is_list || PyList_CheckExact(o)) { + Py_ssize_t n = ((!wraparound) | likely(i >= 0)) ? i : i + PyList_GET_SIZE(o); + if ((!boundscheck) || (likely((n >= 0) & (n < PyList_GET_SIZE(o))))) { + PyObject *r = PyList_GET_ITEM(o, n); + Py_INCREF(r); + return r; + } + } + else if (PyTuple_CheckExact(o)) { + Py_ssize_t n = ((!wraparound) | likely(i >= 0)) ? i : i + PyTuple_GET_SIZE(o); + if ((!boundscheck) || likely((n >= 0) & (n < PyTuple_GET_SIZE(o)))) { + PyObject *r = PyTuple_GET_ITEM(o, n); + Py_INCREF(r); + return r; + } + } else { + PySequenceMethods *m = Py_TYPE(o)->tp_as_sequence; + if (likely(m && m->sq_item)) { + if (wraparound && unlikely(i < 0) && likely(m->sq_length)) { + Py_ssize_t l = m->sq_length(o); + if (likely(l >= 0)) { + i += l; + } else { + if (PyErr_ExceptionMatches(PyExc_OverflowError)) + PyErr_Clear(); + else + return NULL; + } + } + return m->sq_item(o, i); + } + } +#else + if (is_list || PySequence_Check(o)) { + return PySequence_GetItem(o, i); + } +#endif + return __Pyx_GetItemInt_Generic(o, PyInt_FromSsize_t(i)); +} + +#if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw) { + PyObject *result; + ternaryfunc call = func->ob_type->tp_call; + if (unlikely(!call)) + return PyObject_Call(func, arg, kw); +#if PY_VERSION_HEX >= 0x02060000 + if (unlikely(Py_EnterRecursiveCall((char*)" while calling a Python object"))) + return NULL; +#endif + result = (*call)(func, arg, kw); +#if PY_VERSION_HEX >= 0x02060000 + Py_LeaveRecursiveCall(); +#endif + if (unlikely(!result) && unlikely(!PyErr_Occurred())) { + PyErr_SetString( + PyExc_SystemError, + "NULL result without error in PyObject_Call"); + } + return result; +} +#endif + +static CYTHON_INLINE void __Pyx_ErrRestore(PyObject *type, PyObject *value, PyObject *tb) { +#if CYTHON_COMPILING_IN_CPYTHON + PyObject *tmp_type, *tmp_value, *tmp_tb; + PyThreadState *tstate = PyThreadState_GET(); + tmp_type = tstate->curexc_type; + tmp_value = tstate->curexc_value; + tmp_tb = tstate->curexc_traceback; + tstate->curexc_type = type; + tstate->curexc_value = value; + tstate->curexc_traceback = tb; + Py_XDECREF(tmp_type); + Py_XDECREF(tmp_value); + Py_XDECREF(tmp_tb); +#else + PyErr_Restore(type, value, tb); +#endif +} +static CYTHON_INLINE void __Pyx_ErrFetch(PyObject **type, PyObject **value, PyObject **tb) { +#if CYTHON_COMPILING_IN_CPYTHON + PyThreadState *tstate = PyThreadState_GET(); + *type = tstate->curexc_type; + *value = tstate->curexc_value; + *tb = tstate->curexc_traceback; + tstate->curexc_type = 0; + tstate->curexc_value = 0; + tstate->curexc_traceback = 0; +#else + PyErr_Fetch(type, value, tb); +#endif +} + +#if PY_MAJOR_VERSION < 3 +static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, + CYTHON_UNUSED PyObject *cause) { + Py_XINCREF(type); + if (!value || value == Py_None) + value = NULL; + else + Py_INCREF(value); + if (!tb || tb == Py_None) + tb = NULL; + else { + Py_INCREF(tb); + if (!PyTraceBack_Check(tb)) { + PyErr_SetString(PyExc_TypeError, + "raise: arg 3 must be a traceback or None"); + goto raise_error; + } + } + #if PY_VERSION_HEX < 0x02050000 + if (PyClass_Check(type)) { + #else + if (PyType_Check(type)) { + #endif +#if CYTHON_COMPILING_IN_PYPY + if (!value) { + Py_INCREF(Py_None); + value = Py_None; + } +#endif + PyErr_NormalizeException(&type, &value, &tb); + } else { + if (value) { + PyErr_SetString(PyExc_TypeError, + "instance exception may not have a separate value"); + goto raise_error; + } + value = type; + #if PY_VERSION_HEX < 0x02050000 + if (PyInstance_Check(type)) { + type = (PyObject*) ((PyInstanceObject*)type)->in_class; + Py_INCREF(type); + } else { + type = 0; + PyErr_SetString(PyExc_TypeError, + "raise: exception must be an old-style class or instance"); + goto raise_error; + } + #else + type = (PyObject*) Py_TYPE(type); + Py_INCREF(type); + if (!PyType_IsSubtype((PyTypeObject *)type, (PyTypeObject *)PyExc_BaseException)) { + PyErr_SetString(PyExc_TypeError, + "raise: exception class must be a subclass of BaseException"); + goto raise_error; + } + #endif + } + __Pyx_ErrRestore(type, value, tb); + return; +raise_error: + Py_XDECREF(value); + Py_XDECREF(type); + Py_XDECREF(tb); + return; +} +#else /* Python 3+ */ +static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, PyObject *cause) { + PyObject* owned_instance = NULL; + if (tb == Py_None) { + tb = 0; + } else if (tb && !PyTraceBack_Check(tb)) { + PyErr_SetString(PyExc_TypeError, + "raise: arg 3 must be a traceback or None"); + goto bad; + } + if (value == Py_None) + value = 0; + if (PyExceptionInstance_Check(type)) { + if (value) { + PyErr_SetString(PyExc_TypeError, + "instance exception may not have a separate value"); + goto bad; + } + value = type; + type = (PyObject*) Py_TYPE(value); + } else if (PyExceptionClass_Check(type)) { + PyObject *instance_class = NULL; + if (value && PyExceptionInstance_Check(value)) { + instance_class = (PyObject*) Py_TYPE(value); + if (instance_class != type) { + if (PyObject_IsSubclass(instance_class, type)) { + type = instance_class; + } else { + instance_class = NULL; + } + } + } + if (!instance_class) { + PyObject *args; + if (!value) + args = PyTuple_New(0); + else if (PyTuple_Check(value)) { + Py_INCREF(value); + args = value; + } else + args = PyTuple_Pack(1, value); + if (!args) + goto bad; + owned_instance = PyObject_Call(type, args, NULL); + Py_DECREF(args); + if (!owned_instance) + goto bad; + value = owned_instance; + if (!PyExceptionInstance_Check(value)) { + PyErr_Format(PyExc_TypeError, + "calling %R should have returned an instance of " + "BaseException, not %R", + type, Py_TYPE(value)); + goto bad; + } + } + } else { + PyErr_SetString(PyExc_TypeError, + "raise: exception class must be a subclass of BaseException"); + goto bad; + } +#if PY_VERSION_HEX >= 0x03030000 + if (cause) { +#else + if (cause && cause != Py_None) { +#endif + PyObject *fixed_cause; + if (cause == Py_None) { + fixed_cause = NULL; + } else if (PyExceptionClass_Check(cause)) { + fixed_cause = PyObject_CallObject(cause, NULL); + if (fixed_cause == NULL) + goto bad; + } else if (PyExceptionInstance_Check(cause)) { + fixed_cause = cause; + Py_INCREF(fixed_cause); + } else { + PyErr_SetString(PyExc_TypeError, + "exception causes must derive from " + "BaseException"); + goto bad; + } + PyException_SetCause(value, fixed_cause); + } + PyErr_SetObject(type, value); + if (tb) { + PyThreadState *tstate = PyThreadState_GET(); + PyObject* tmp_tb = tstate->curexc_traceback; + if (tb != tmp_tb) { + Py_INCREF(tb); + tstate->curexc_traceback = tb; + Py_XDECREF(tmp_tb); + } + } +bad: + Py_XDECREF(owned_instance); + return; +} +#endif + +static CYTHON_INLINE Py_UCS4 __Pyx_GetItemInt_Unicode_Fast(PyObject* ustring, Py_ssize_t i, + int wraparound, int boundscheck) { + Py_ssize_t length; +#if CYTHON_PEP393_ENABLED + if (unlikely(__Pyx_PyUnicode_READY(ustring) < 0)) return (Py_UCS4)-1; +#endif + if (wraparound | boundscheck) { + length = __Pyx_PyUnicode_GET_LENGTH(ustring); + if (wraparound & unlikely(i < 0)) i += length; + if ((!boundscheck) || likely((0 <= i) & (i < length))) { + return __Pyx_PyUnicode_READ_CHAR(ustring, i); + } else { + PyErr_SetString(PyExc_IndexError, "string index out of range"); + return (Py_UCS4)-1; + } + } else { + return __Pyx_PyUnicode_READ_CHAR(ustring, i); + } +} + +static CYTHON_INLINE PyObject* __Pyx_PyUnicode_Substring( + PyObject* text, Py_ssize_t start, Py_ssize_t stop) { + Py_ssize_t length; +#if CYTHON_PEP393_ENABLED + if (unlikely(PyUnicode_READY(text) == -1)) return NULL; + length = PyUnicode_GET_LENGTH(text); +#else + length = PyUnicode_GET_SIZE(text); +#endif + if (start < 0) { + start += length; + if (start < 0) + start = 0; + } + if (stop < 0) + stop += length; + else if (stop > length) + stop = length; + length = stop - start; + if (length <= 0) + return PyUnicode_FromUnicode(NULL, 0); +#if CYTHON_PEP393_ENABLED + return PyUnicode_FromKindAndData(PyUnicode_KIND(text), + PyUnicode_1BYTE_DATA(text) + start*PyUnicode_KIND(text), stop-start); +#else + return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(text)+start, stop-start); +#endif +} + +static CYTHON_INLINE int __Pyx_SetItemInt_Generic(PyObject *o, PyObject *j, PyObject *v) { + int r; + if (!j) return -1; + r = PyObject_SetItem(o, j, v); + Py_DECREF(j); + return r; +} +static CYTHON_INLINE int __Pyx_SetItemInt_Fast(PyObject *o, Py_ssize_t i, PyObject *v, + int is_list, int wraparound, int boundscheck) { +#if CYTHON_COMPILING_IN_CPYTHON + if (is_list || PyList_CheckExact(o)) { + Py_ssize_t n = (!wraparound) ? i : ((likely(i >= 0)) ? i : i + PyList_GET_SIZE(o)); + if ((!boundscheck) || likely((n >= 0) & (n < PyList_GET_SIZE(o)))) { + PyObject* old = PyList_GET_ITEM(o, n); + Py_INCREF(v); + PyList_SET_ITEM(o, n, v); + Py_DECREF(old); + return 1; + } + } else { + PySequenceMethods *m = Py_TYPE(o)->tp_as_sequence; + if (likely(m && m->sq_ass_item)) { + if (wraparound && unlikely(i < 0) && likely(m->sq_length)) { + Py_ssize_t l = m->sq_length(o); + if (likely(l >= 0)) { + i += l; + } else { + if (PyErr_ExceptionMatches(PyExc_OverflowError)) + PyErr_Clear(); + else + return -1; + } + } + return m->sq_ass_item(o, i, v); + } + } +#else +#if CYTHON_COMPILING_IN_PYPY + if (is_list || (PySequence_Check(o) && !PyDict_Check(o))) { +#else + if (is_list || PySequence_Check(o)) { +#endif + return PySequence_SetItem(o, i, v); + } +#endif + return __Pyx_SetItemInt_Generic(o, PyInt_FromSsize_t(i), v); +} + +static void __Pyx_WriteUnraisable(const char *name, CYTHON_UNUSED int clineno, + CYTHON_UNUSED int lineno, CYTHON_UNUSED const char *filename, + int full_traceback) { + PyObject *old_exc, *old_val, *old_tb; + PyObject *ctx; + __Pyx_ErrFetch(&old_exc, &old_val, &old_tb); + if (full_traceback) { + Py_XINCREF(old_exc); + Py_XINCREF(old_val); + Py_XINCREF(old_tb); + __Pyx_ErrRestore(old_exc, old_val, old_tb); + PyErr_PrintEx(1); + } + #if PY_MAJOR_VERSION < 3 + ctx = PyString_FromString(name); + #else + ctx = PyUnicode_FromString(name); + #endif + __Pyx_ErrRestore(old_exc, old_val, old_tb); + if (!ctx) { + PyErr_WriteUnraisable(Py_None); + } else { + PyErr_WriteUnraisable(ctx); + Py_DECREF(ctx); + } +} + +#define __PYX_VERIFY_RETURN_INT(target_type, func_type, func) \ + { \ + func_type value = func(x); \ + if (sizeof(target_type) < sizeof(func_type)) { \ + if (unlikely(value != (func_type) (target_type) value)) { \ + func_type zero = 0; \ + PyErr_SetString(PyExc_OverflowError, \ + (is_unsigned && unlikely(value < zero)) ? \ + "can't convert negative value to " #target_type : \ + "value too large to convert to " #target_type); \ + return (target_type) -1; \ + } \ + } \ + return (target_type) value; \ + } + +#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + #include "longintrepr.h" + #endif +#endif +static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *x) { + const int neg_one = (int) -1, const_zero = 0; + const int is_unsigned = neg_one > const_zero; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_Check(x))) { + if (sizeof(int) < sizeof(long)) { + __PYX_VERIFY_RETURN_INT(int, long, PyInt_AS_LONG) + } else { + long val = PyInt_AS_LONG(x); + if (is_unsigned && unlikely(val < 0)) { + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to int"); + return (int) -1; + } + return (int) val; + } + } else +#endif + if (likely(PyLong_Check(x))) { + if (is_unsigned) { +#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + if (sizeof(digit) <= sizeof(int)) { + switch (Py_SIZE(x)) { + case 0: return 0; + case 1: return (int) ((PyLongObject*)x)->ob_digit[0]; + } + } + #endif +#endif + if (unlikely(Py_SIZE(x) < 0)) { + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to int"); + return (int) -1; + } + if (sizeof(int) <= sizeof(unsigned long)) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, PyLong_AsUnsignedLong) + } else if (sizeof(int) <= sizeof(unsigned long long)) { + __PYX_VERIFY_RETURN_INT(int, unsigned long long, PyLong_AsUnsignedLongLong) + } + } else { +#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + if (sizeof(digit) <= sizeof(int)) { + switch (Py_SIZE(x)) { + case 0: return 0; + case 1: return +(int) ((PyLongObject*)x)->ob_digit[0]; + case -1: return -(int) ((PyLongObject*)x)->ob_digit[0]; + } + } + #endif +#endif + if (sizeof(int) <= sizeof(long)) { + __PYX_VERIFY_RETURN_INT(int, long, PyLong_AsLong) + } else if (sizeof(int) <= sizeof(long long)) { + __PYX_VERIFY_RETURN_INT(int, long long, PyLong_AsLongLong) + } + } + { +#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray) + PyErr_SetString(PyExc_RuntimeError, + "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers"); +#else + int val; + PyObject *v = __Pyx_PyNumber_Int(x); + #if PY_MAJOR_VERSION < 3 + if (likely(v) && !PyLong_Check(v)) { + PyObject *tmp = v; + v = PyNumber_Long(tmp); + Py_DECREF(tmp); + } + #endif + if (likely(v)) { + int one = 1; int is_little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&val; + int ret = _PyLong_AsByteArray((PyLongObject *)v, + bytes, sizeof(val), + is_little, !is_unsigned); + Py_DECREF(v); + if (likely(!ret)) + return val; + } +#endif + return (int) -1; + } + } else { + int val; + PyObject *tmp = __Pyx_PyNumber_Int(x); + if (!tmp) return (int) -1; + val = __Pyx_PyInt_As_int(tmp); + Py_DECREF(tmp); + return val; + } +} + +#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + #include "longintrepr.h" + #endif +#endif +static CYTHON_INLINE uint64_t __Pyx_PyInt_As_uint64_t(PyObject *x) { + const uint64_t neg_one = (uint64_t) -1, const_zero = 0; + const int is_unsigned = neg_one > const_zero; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_Check(x))) { + if (sizeof(uint64_t) < sizeof(long)) { + __PYX_VERIFY_RETURN_INT(uint64_t, long, PyInt_AS_LONG) + } else { + long val = PyInt_AS_LONG(x); + if (is_unsigned && unlikely(val < 0)) { + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to uint64_t"); + return (uint64_t) -1; + } + return (uint64_t) val; + } + } else +#endif + if (likely(PyLong_Check(x))) { + if (is_unsigned) { +#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + if (sizeof(digit) <= sizeof(uint64_t)) { + switch (Py_SIZE(x)) { + case 0: return 0; + case 1: return (uint64_t) ((PyLongObject*)x)->ob_digit[0]; + } + } + #endif +#endif + if (unlikely(Py_SIZE(x) < 0)) { + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to uint64_t"); + return (uint64_t) -1; + } + if (sizeof(uint64_t) <= sizeof(unsigned long)) { + __PYX_VERIFY_RETURN_INT(uint64_t, unsigned long, PyLong_AsUnsignedLong) + } else if (sizeof(uint64_t) <= sizeof(unsigned long long)) { + __PYX_VERIFY_RETURN_INT(uint64_t, unsigned long long, PyLong_AsUnsignedLongLong) + } + } else { +#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + if (sizeof(digit) <= sizeof(uint64_t)) { + switch (Py_SIZE(x)) { + case 0: return 0; + case 1: return +(uint64_t) ((PyLongObject*)x)->ob_digit[0]; + case -1: return -(uint64_t) ((PyLongObject*)x)->ob_digit[0]; + } + } + #endif +#endif + if (sizeof(uint64_t) <= sizeof(long)) { + __PYX_VERIFY_RETURN_INT(uint64_t, long, PyLong_AsLong) + } else if (sizeof(uint64_t) <= sizeof(long long)) { + __PYX_VERIFY_RETURN_INT(uint64_t, long long, PyLong_AsLongLong) + } + } + { +#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray) + PyErr_SetString(PyExc_RuntimeError, + "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers"); +#else + uint64_t val; + PyObject *v = __Pyx_PyNumber_Int(x); + #if PY_MAJOR_VERSION < 3 + if (likely(v) && !PyLong_Check(v)) { + PyObject *tmp = v; + v = PyNumber_Long(tmp); + Py_DECREF(tmp); + } + #endif + if (likely(v)) { + int one = 1; int is_little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&val; + int ret = _PyLong_AsByteArray((PyLongObject *)v, + bytes, sizeof(val), + is_little, !is_unsigned); + Py_DECREF(v); + if (likely(!ret)) + return val; + } +#endif + return (uint64_t) -1; + } + } else { + uint64_t val; + PyObject *tmp = __Pyx_PyNumber_Int(x); + if (!tmp) return (uint64_t) -1; + val = __Pyx_PyInt_As_uint64_t(tmp); + Py_DECREF(tmp); + return val; + } +} + +#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + #include "longintrepr.h" + #endif +#endif +static CYTHON_INLINE size_t __Pyx_PyInt_As_size_t(PyObject *x) { + const size_t neg_one = (size_t) -1, const_zero = 0; + const int is_unsigned = neg_one > const_zero; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_Check(x))) { + if (sizeof(size_t) < sizeof(long)) { + __PYX_VERIFY_RETURN_INT(size_t, long, PyInt_AS_LONG) + } else { + long val = PyInt_AS_LONG(x); + if (is_unsigned && unlikely(val < 0)) { + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to size_t"); + return (size_t) -1; + } + return (size_t) val; + } + } else +#endif + if (likely(PyLong_Check(x))) { + if (is_unsigned) { +#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + if (sizeof(digit) <= sizeof(size_t)) { + switch (Py_SIZE(x)) { + case 0: return 0; + case 1: return (size_t) ((PyLongObject*)x)->ob_digit[0]; + } + } + #endif +#endif + if (unlikely(Py_SIZE(x) < 0)) { + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to size_t"); + return (size_t) -1; + } + if (sizeof(size_t) <= sizeof(unsigned long)) { + __PYX_VERIFY_RETURN_INT(size_t, unsigned long, PyLong_AsUnsignedLong) + } else if (sizeof(size_t) <= sizeof(unsigned long long)) { + __PYX_VERIFY_RETURN_INT(size_t, unsigned long long, PyLong_AsUnsignedLongLong) + } + } else { +#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + if (sizeof(digit) <= sizeof(size_t)) { + switch (Py_SIZE(x)) { + case 0: return 0; + case 1: return +(size_t) ((PyLongObject*)x)->ob_digit[0]; + case -1: return -(size_t) ((PyLongObject*)x)->ob_digit[0]; + } + } + #endif +#endif + if (sizeof(size_t) <= sizeof(long)) { + __PYX_VERIFY_RETURN_INT(size_t, long, PyLong_AsLong) + } else if (sizeof(size_t) <= sizeof(long long)) { + __PYX_VERIFY_RETURN_INT(size_t, long long, PyLong_AsLongLong) + } + } + { +#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray) + PyErr_SetString(PyExc_RuntimeError, + "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers"); +#else + size_t val; + PyObject *v = __Pyx_PyNumber_Int(x); + #if PY_MAJOR_VERSION < 3 + if (likely(v) && !PyLong_Check(v)) { + PyObject *tmp = v; + v = PyNumber_Long(tmp); + Py_DECREF(tmp); + } + #endif + if (likely(v)) { + int one = 1; int is_little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&val; + int ret = _PyLong_AsByteArray((PyLongObject *)v, + bytes, sizeof(val), + is_little, !is_unsigned); + Py_DECREF(v); + if (likely(!ret)) + return val; + } +#endif + return (size_t) -1; + } + } else { + size_t val; + PyObject *tmp = __Pyx_PyNumber_Int(x); + if (!tmp) return (size_t) -1; + val = __Pyx_PyInt_As_size_t(tmp); + Py_DECREF(tmp); + return val; + } +} + +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_uint64_t(uint64_t value) { + const uint64_t neg_one = (uint64_t) -1, const_zero = 0; + const int is_unsigned = neg_one > const_zero; + if (is_unsigned) { + if (sizeof(uint64_t) < sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(uint64_t) <= sizeof(unsigned long)) { + return PyLong_FromUnsignedLong((unsigned long) value); + } else if (sizeof(uint64_t) <= sizeof(unsigned long long)) { + return PyLong_FromUnsignedLongLong((unsigned long long) value); + } + } else { + if (sizeof(uint64_t) <= sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(uint64_t) <= sizeof(long long)) { + return PyLong_FromLongLong((long long) value); + } + } + { + int one = 1; int little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&value; + return _PyLong_FromByteArray(bytes, sizeof(uint64_t), + little, !is_unsigned); + } +} + +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value) { + const long neg_one = (long) -1, const_zero = 0; + const int is_unsigned = neg_one > const_zero; + if (is_unsigned) { + if (sizeof(long) < sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(long) <= sizeof(unsigned long)) { + return PyLong_FromUnsignedLong((unsigned long) value); + } else if (sizeof(long) <= sizeof(unsigned long long)) { + return PyLong_FromUnsignedLongLong((unsigned long long) value); + } + } else { + if (sizeof(long) <= sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(long) <= sizeof(long long)) { + return PyLong_FromLongLong((long long) value); + } + } + { + int one = 1; int little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&value; + return _PyLong_FromByteArray(bytes, sizeof(long), + little, !is_unsigned); + } +} + +#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + #include "longintrepr.h" + #endif +#endif +static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *x) { + const long neg_one = (long) -1, const_zero = 0; + const int is_unsigned = neg_one > const_zero; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_Check(x))) { + if (sizeof(long) < sizeof(long)) { + __PYX_VERIFY_RETURN_INT(long, long, PyInt_AS_LONG) + } else { + long val = PyInt_AS_LONG(x); + if (is_unsigned && unlikely(val < 0)) { + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to long"); + return (long) -1; + } + return (long) val; + } + } else +#endif + if (likely(PyLong_Check(x))) { + if (is_unsigned) { +#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + if (sizeof(digit) <= sizeof(long)) { + switch (Py_SIZE(x)) { + case 0: return 0; + case 1: return (long) ((PyLongObject*)x)->ob_digit[0]; + } + } + #endif +#endif + if (unlikely(Py_SIZE(x) < 0)) { + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to long"); + return (long) -1; + } + if (sizeof(long) <= sizeof(unsigned long)) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, PyLong_AsUnsignedLong) + } else if (sizeof(long) <= sizeof(unsigned long long)) { + __PYX_VERIFY_RETURN_INT(long, unsigned long long, PyLong_AsUnsignedLongLong) + } + } else { +#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + if (sizeof(digit) <= sizeof(long)) { + switch (Py_SIZE(x)) { + case 0: return 0; + case 1: return +(long) ((PyLongObject*)x)->ob_digit[0]; + case -1: return -(long) ((PyLongObject*)x)->ob_digit[0]; + } + } + #endif +#endif + if (sizeof(long) <= sizeof(long)) { + __PYX_VERIFY_RETURN_INT(long, long, PyLong_AsLong) + } else if (sizeof(long) <= sizeof(long long)) { + __PYX_VERIFY_RETURN_INT(long, long long, PyLong_AsLongLong) + } + } + { +#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray) + PyErr_SetString(PyExc_RuntimeError, + "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers"); +#else + long val; + PyObject *v = __Pyx_PyNumber_Int(x); + #if PY_MAJOR_VERSION < 3 + if (likely(v) && !PyLong_Check(v)) { + PyObject *tmp = v; + v = PyNumber_Long(tmp); + Py_DECREF(tmp); + } + #endif + if (likely(v)) { + int one = 1; int is_little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&val; + int ret = _PyLong_AsByteArray((PyLongObject *)v, + bytes, sizeof(val), + is_little, !is_unsigned); + Py_DECREF(v); + if (likely(!ret)) + return val; + } +#endif + return (long) -1; + } + } else { + long val; + PyObject *tmp = __Pyx_PyNumber_Int(x); + if (!tmp) return (long) -1; + val = __Pyx_PyInt_As_long(tmp); + Py_DECREF(tmp); + return val; + } +} + +static int __Pyx_check_binary_version(void) { + char ctversion[4], rtversion[4]; + PyOS_snprintf(ctversion, 4, "%d.%d", PY_MAJOR_VERSION, PY_MINOR_VERSION); + PyOS_snprintf(rtversion, 4, "%s", Py_GetVersion()); + if (ctversion[0] != rtversion[0] || ctversion[2] != rtversion[2]) { + char message[200]; + PyOS_snprintf(message, sizeof(message), + "compiletime version %s of module '%.100s' " + "does not match runtime version %s", + ctversion, __Pyx_MODULE_NAME, rtversion); + #if PY_VERSION_HEX < 0x02050000 + return PyErr_Warn(NULL, message); + #else + return PyErr_WarnEx(NULL, message, 1); + #endif + } + return 0; +} + +static int __Pyx_ExportVoidPtr(PyObject *name, void *p, const char *sig) { + PyObject *d; + PyObject *cobj = 0; + d = PyDict_GetItem(__pyx_d, __pyx_n_s_pyx_capi); + Py_XINCREF(d); + if (!d) { + d = PyDict_New(); + if (!d) + goto bad; + if (__Pyx_PyObject_SetAttrStr(__pyx_m, __pyx_n_s_pyx_capi, d) < 0) + goto bad; + } +#if PY_VERSION_HEX >= 0x02070000 && !(PY_MAJOR_VERSION==3 && PY_MINOR_VERSION==0) + cobj = PyCapsule_New(p, sig, 0); +#else + cobj = PyCObject_FromVoidPtrAndDesc(p, (void *)sig, 0); +#endif + if (!cobj) + goto bad; + if (PyDict_SetItem(d, name, cobj) < 0) + goto bad; + Py_DECREF(cobj); + Py_DECREF(d); + return 0; +bad: + Py_XDECREF(cobj); + Py_XDECREF(d); + return -1; +} + +static int __Pyx_ExportFunction(const char *name, void (*f)(void), const char *sig) { + PyObject *d = 0; + PyObject *cobj = 0; + union { + void (*fp)(void); + void *p; + } tmp; + d = PyObject_GetAttrString(__pyx_m, (char *)"__pyx_capi__"); + if (!d) { + PyErr_Clear(); + d = PyDict_New(); + if (!d) + goto bad; + Py_INCREF(d); + if (PyModule_AddObject(__pyx_m, (char *)"__pyx_capi__", d) < 0) + goto bad; + } + tmp.fp = f; +#if PY_VERSION_HEX >= 0x02070000 && !(PY_MAJOR_VERSION==3&&PY_MINOR_VERSION==0) + cobj = PyCapsule_New(tmp.p, sig, 0); +#else + cobj = PyCObject_FromVoidPtrAndDesc(tmp.p, (void *)sig, 0); +#endif + if (!cobj) + goto bad; + if (PyDict_SetItemString(d, name, cobj) < 0) + goto bad; + Py_DECREF(cobj); + Py_DECREF(d); + return 0; +bad: + Py_XDECREF(cobj); + Py_XDECREF(d); + return -1; +} + +static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line) { + int start = 0, mid = 0, end = count - 1; + if (end >= 0 && code_line > entries[end].code_line) { + return count; + } + while (start < end) { + mid = (start + end) / 2; + if (code_line < entries[mid].code_line) { + end = mid; + } else if (code_line > entries[mid].code_line) { + start = mid + 1; + } else { + return mid; + } + } + if (code_line <= entries[mid].code_line) { + return mid; + } else { + return mid + 1; + } +} +static PyCodeObject *__pyx_find_code_object(int code_line) { + PyCodeObject* code_object; + int pos; + if (unlikely(!code_line) || unlikely(!__pyx_code_cache.entries)) { + return NULL; + } + pos = __pyx_bisect_code_objects(__pyx_code_cache.entries, __pyx_code_cache.count, code_line); + if (unlikely(pos >= __pyx_code_cache.count) || unlikely(__pyx_code_cache.entries[pos].code_line != code_line)) { + return NULL; + } + code_object = __pyx_code_cache.entries[pos].code_object; + Py_INCREF(code_object); + return code_object; +} +static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object) { + int pos, i; + __Pyx_CodeObjectCacheEntry* entries = __pyx_code_cache.entries; + if (unlikely(!code_line)) { + return; + } + if (unlikely(!entries)) { + entries = (__Pyx_CodeObjectCacheEntry*)PyMem_Malloc(64*sizeof(__Pyx_CodeObjectCacheEntry)); + if (likely(entries)) { + __pyx_code_cache.entries = entries; + __pyx_code_cache.max_count = 64; + __pyx_code_cache.count = 1; + entries[0].code_line = code_line; + entries[0].code_object = code_object; + Py_INCREF(code_object); + } + return; + } + pos = __pyx_bisect_code_objects(__pyx_code_cache.entries, __pyx_code_cache.count, code_line); + if ((pos < __pyx_code_cache.count) && unlikely(__pyx_code_cache.entries[pos].code_line == code_line)) { + PyCodeObject* tmp = entries[pos].code_object; + entries[pos].code_object = code_object; + Py_DECREF(tmp); + return; + } + if (__pyx_code_cache.count == __pyx_code_cache.max_count) { + int new_max = __pyx_code_cache.max_count + 64; + entries = (__Pyx_CodeObjectCacheEntry*)PyMem_Realloc( + __pyx_code_cache.entries, new_max*sizeof(__Pyx_CodeObjectCacheEntry)); + if (unlikely(!entries)) { + return; + } + __pyx_code_cache.entries = entries; + __pyx_code_cache.max_count = new_max; + } + for (i=__pyx_code_cache.count; i>pos; i--) { + entries[i] = entries[i-1]; + } + entries[pos].code_line = code_line; + entries[pos].code_object = code_object; + __pyx_code_cache.count++; + Py_INCREF(code_object); +} + +#include "compile.h" +#include "frameobject.h" +#include "traceback.h" +static PyCodeObject* __Pyx_CreateCodeObjectForTraceback( + const char *funcname, int c_line, + int py_line, const char *filename) { + PyCodeObject *py_code = 0; + PyObject *py_srcfile = 0; + PyObject *py_funcname = 0; + #if PY_MAJOR_VERSION < 3 + py_srcfile = PyString_FromString(filename); + #else + py_srcfile = PyUnicode_FromString(filename); + #endif + if (!py_srcfile) goto bad; + if (c_line) { + #if PY_MAJOR_VERSION < 3 + py_funcname = PyString_FromFormat( "%s (%s:%d)", funcname, __pyx_cfilenm, c_line); + #else + py_funcname = PyUnicode_FromFormat( "%s (%s:%d)", funcname, __pyx_cfilenm, c_line); + #endif + } + else { + #if PY_MAJOR_VERSION < 3 + py_funcname = PyString_FromString(funcname); + #else + py_funcname = PyUnicode_FromString(funcname); + #endif + } + if (!py_funcname) goto bad; + py_code = __Pyx_PyCode_New( + 0, /*int argcount,*/ + 0, /*int kwonlyargcount,*/ + 0, /*int nlocals,*/ + 0, /*int stacksize,*/ + 0, /*int flags,*/ + __pyx_empty_bytes, /*PyObject *code,*/ + __pyx_empty_tuple, /*PyObject *consts,*/ + __pyx_empty_tuple, /*PyObject *names,*/ + __pyx_empty_tuple, /*PyObject *varnames,*/ + __pyx_empty_tuple, /*PyObject *freevars,*/ + __pyx_empty_tuple, /*PyObject *cellvars,*/ + py_srcfile, /*PyObject *filename,*/ + py_funcname, /*PyObject *name,*/ + py_line, /*int firstlineno,*/ + __pyx_empty_bytes /*PyObject *lnotab*/ + ); + Py_DECREF(py_srcfile); + Py_DECREF(py_funcname); + return py_code; +bad: + Py_XDECREF(py_srcfile); + Py_XDECREF(py_funcname); + return NULL; +} +static void __Pyx_AddTraceback(const char *funcname, int c_line, + int py_line, const char *filename) { + PyCodeObject *py_code = 0; + PyObject *py_globals = 0; + PyFrameObject *py_frame = 0; + py_code = __pyx_find_code_object(c_line ? c_line : py_line); + if (!py_code) { + py_code = __Pyx_CreateCodeObjectForTraceback( + funcname, c_line, py_line, filename); + if (!py_code) goto bad; + __pyx_insert_code_object(c_line ? c_line : py_line, py_code); + } + py_globals = PyModule_GetDict(__pyx_m); + if (!py_globals) goto bad; + py_frame = PyFrame_New( + PyThreadState_GET(), /*PyThreadState *tstate,*/ + py_code, /*PyCodeObject *code,*/ + py_globals, /*PyObject *globals,*/ + 0 /*PyObject *locals*/ + ); + if (!py_frame) goto bad; + py_frame->f_lineno = py_line; + PyTraceBack_Here(py_frame); +bad: + Py_XDECREF(py_code); + Py_XDECREF(py_frame); +} + +static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) { + while (t->p) { + #if PY_MAJOR_VERSION < 3 + if (t->is_unicode) { + *t->p = PyUnicode_DecodeUTF8(t->s, t->n - 1, NULL); + } else if (t->intern) { + *t->p = PyString_InternFromString(t->s); + } else { + *t->p = PyString_FromStringAndSize(t->s, t->n - 1); + } + #else /* Python 3+ has unicode identifiers */ + if (t->is_unicode | t->is_str) { + if (t->intern) { + *t->p = PyUnicode_InternFromString(t->s); + } else if (t->encoding) { + *t->p = PyUnicode_Decode(t->s, t->n - 1, t->encoding, NULL); + } else { + *t->p = PyUnicode_FromStringAndSize(t->s, t->n - 1); + } + } else { + *t->p = PyBytes_FromStringAndSize(t->s, t->n - 1); + } + #endif + if (!*t->p) + return -1; + ++t; + } + return 0; +} + +static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(char* c_str) { + return __Pyx_PyUnicode_FromStringAndSize(c_str, strlen(c_str)); +} +static CYTHON_INLINE char* __Pyx_PyObject_AsString(PyObject* o) { + Py_ssize_t ignore; + return __Pyx_PyObject_AsStringAndSize(o, &ignore); +} +static CYTHON_INLINE char* __Pyx_PyObject_AsStringAndSize(PyObject* o, Py_ssize_t *length) { +#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT + if ( +#if PY_MAJOR_VERSION < 3 && __PYX_DEFAULT_STRING_ENCODING_IS_ASCII + __Pyx_sys_getdefaultencoding_not_ascii && +#endif + PyUnicode_Check(o)) { +#if PY_VERSION_HEX < 0x03030000 + char* defenc_c; + PyObject* defenc = _PyUnicode_AsDefaultEncodedString(o, NULL); + if (!defenc) return NULL; + defenc_c = PyBytes_AS_STRING(defenc); +#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII + { + char* end = defenc_c + PyBytes_GET_SIZE(defenc); + char* c; + for (c = defenc_c; c < end; c++) { + if ((unsigned char) (*c) >= 128) { + PyUnicode_AsASCIIString(o); + return NULL; + } + } + } +#endif /*__PYX_DEFAULT_STRING_ENCODING_IS_ASCII*/ + *length = PyBytes_GET_SIZE(defenc); + return defenc_c; +#else /* PY_VERSION_HEX < 0x03030000 */ + if (PyUnicode_READY(o) == -1) return NULL; +#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII + if (PyUnicode_IS_ASCII(o)) { + *length = PyUnicode_GET_DATA_SIZE(o); + return PyUnicode_AsUTF8(o); + } else { + PyUnicode_AsASCIIString(o); + return NULL; + } +#else /* __PYX_DEFAULT_STRING_ENCODING_IS_ASCII */ + return PyUnicode_AsUTF8AndSize(o, length); +#endif /* __PYX_DEFAULT_STRING_ENCODING_IS_ASCII */ +#endif /* PY_VERSION_HEX < 0x03030000 */ + } else +#endif /* __PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT */ +#if !CYTHON_COMPILING_IN_PYPY +#if PY_VERSION_HEX >= 0x02060000 + if (PyByteArray_Check(o)) { + *length = PyByteArray_GET_SIZE(o); + return PyByteArray_AS_STRING(o); + } else +#endif +#endif + { + char* result; + int r = PyBytes_AsStringAndSize(o, &result, length); + if (unlikely(r < 0)) { + return NULL; + } else { + return result; + } + } +} +static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject* x) { + int is_true = x == Py_True; + if (is_true | (x == Py_False) | (x == Py_None)) return is_true; + else return PyObject_IsTrue(x); +} +static CYTHON_INLINE PyObject* __Pyx_PyNumber_Int(PyObject* x) { + PyNumberMethods *m; + const char *name = NULL; + PyObject *res = NULL; +#if PY_MAJOR_VERSION < 3 + if (PyInt_Check(x) || PyLong_Check(x)) +#else + if (PyLong_Check(x)) +#endif + return Py_INCREF(x), x; + m = Py_TYPE(x)->tp_as_number; +#if PY_MAJOR_VERSION < 3 + if (m && m->nb_int) { + name = "int"; + res = PyNumber_Int(x); + } + else if (m && m->nb_long) { + name = "long"; + res = PyNumber_Long(x); + } +#else + if (m && m->nb_int) { + name = "int"; + res = PyNumber_Long(x); + } +#endif + if (res) { +#if PY_MAJOR_VERSION < 3 + if (!PyInt_Check(res) && !PyLong_Check(res)) { +#else + if (!PyLong_Check(res)) { +#endif + PyErr_Format(PyExc_TypeError, + "__%.4s__ returned non-%.4s (type %.200s)", + name, name, Py_TYPE(res)->tp_name); + Py_DECREF(res); + return NULL; + } + } + else if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_TypeError, + "an integer is required"); + } + return res; +} +#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + #include "longintrepr.h" + #endif +#endif +static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject* b) { + Py_ssize_t ival; + PyObject *x; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_CheckExact(b))) + return PyInt_AS_LONG(b); +#endif + if (likely(PyLong_CheckExact(b))) { + #if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + switch (Py_SIZE(b)) { + case -1: return -(sdigit)((PyLongObject*)b)->ob_digit[0]; + case 0: return 0; + case 1: return ((PyLongObject*)b)->ob_digit[0]; + } + #endif + #endif + #if PY_VERSION_HEX < 0x02060000 + return PyInt_AsSsize_t(b); + #else + return PyLong_AsSsize_t(b); + #endif + } + x = PyNumber_Index(b); + if (!x) return -1; + ival = PyInt_AsSsize_t(x); + Py_DECREF(x); + return ival; +} +static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t ival) { +#if PY_VERSION_HEX < 0x02050000 + if (ival <= LONG_MAX) + return PyInt_FromLong((long)ival); + else { + unsigned char *bytes = (unsigned char *) &ival; + int one = 1; int little = (int)*(unsigned char*)&one; + return _PyLong_FromByteArray(bytes, sizeof(size_t), little, 0); + } +#else + return PyInt_FromSize_t(ival); +#endif +} + + +#endif /* Py_PYTHON_H */ diff --git a/spacy/en.pxd b/spacy/en.pxd new file mode 100644 index 000000000..1157c1192 --- /dev/null +++ b/spacy/en.pxd @@ -0,0 +1,17 @@ +from ext.sparsehash cimport dense_hash_map +from spacy.lexeme cimport StringHash +from spacy.lexeme cimport Lexeme + + +ctypedef Py_UNICODE* string_ptr +ctypedef size_t Lexeme_addr # For python interop +ctypedef Lexeme* Lexeme_ptr + + +cdef dense_hash_map[StringHash, Lexeme_ptr] LEXEMES + + +cpdef Lexeme_addr lookup(unicode word) except 0 +cpdef Lexeme_addr lookup_chunk(unicode chunk, int start, int end) except 0 +cdef StringHash hash_string(unicode s, size_t length) except 0 +cpdef unicode unhash(StringHash hash_value) diff --git a/spacy/en.pyx b/spacy/en.pyx new file mode 100644 index 000000000..cafcc7795 --- /dev/null +++ b/spacy/en.pyx @@ -0,0 +1,165 @@ +'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index, +so that strings can be retrieved from hashes. Use 64-bit hash values and +boldly assume no collisions. +''' +from __future__ import unicode_literals + +from libc.stdlib cimport malloc, calloc, free +from libc.stdint cimport uint64_t + +from spacy.lexeme cimport Lexeme +from ext.murmurhash cimport MurmurHash64A +from ext.murmurhash cimport MurmurHash64B + + +STRINGS = {} +LEXEMES = dense_hash_map[StringHash, Lexeme_ptr]() +LEXEMES.set_empty_key(0) + + +cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL) + + +cpdef Lexeme_addr lookup(unicode string) except 0: + '''.. function:: enumerate(sequence[, start=0]) + Fetch a Lexeme representing a word string. If the word has not been seen, + construct one, splitting off any attached punctuation or clitics. A + reference to BLANK_WORD is returned for the empty string. + + To specify the boundaries of the word if it has not been seen, use lookup_chunk. + ''' + if string == '': + return &BLANK_WORD + cdef size_t length = len(string) + cdef StringHash hashed = hash_string(string, length) + cdef Lexeme* word_ptr = LEXEMES[hashed] + cdef size_t n + if word_ptr == NULL: + word_ptr = _add(hashed, string, _find_split(string, length), length) + return word_ptr + + +cpdef Lexeme_addr lookup_chunk(unicode string, int start, int end) except 0: + '''Fetch a Lexeme representing a word string. If the word has not been seen, + construct one, given the specified start and end indices. A negative index + significes 0 for start, and the string length for end --- i.e. the string + will not be sliced if start == -1 and end == -1. + + A reference to BLANK_WORD is returned for the empty string. + ''' + if string == '': + return &BLANK_WORD + cdef size_t length = len(string) + cdef StringHash hashed = hash_string(string, length) + cdef Lexeme* chunk_ptr = LEXEMES[hashed] + if chunk_ptr == NULL: + chunk_ptr = _add(hashed, string, start, length) + return chunk_ptr + + +cdef StringHash hash_string(unicode s, size_t length) except 0: + '''Hash unicode with MurmurHash64A''' + assert length + return MurmurHash64A(s, length * sizeof(Py_UNICODE), 0) + + +cpdef unicode unhash(StringHash hash_value): + '''Fetch a string from the reverse index, given its hash value.''' + cdef string_ptr string = STRINGS[hash_value] + if string == NULL: + raise ValueError(hash_value) + + return string + + +cdef unicode normalize_word_string(unicode word): + '''Return a normalized version of the word, mapping: + - 4 digit strings into !YEAR + - Other digit strings into !DIGITS + - All other strings into lower-case + ''' + cdef unicode s + if word.isdigit() and len(word) == 4: + return '!YEAR' + elif word[0].isdigit(): + return '!DIGITS' + else: + return word.lower() + + +cpdef unicode _substr(unicode string, int start, int end, size_t length): + if end >= length: + end = -1 + if start >= length: + start = 0 + if start <= 0 and end < 0: + return string + elif start < 0: + start = 0 + elif end < 0: + end = length + return string[start:end] + + +cdef Lexeme* _add(StringHash hashed, unicode string, int split, size_t length) except NULL: + assert string + assert split <= length + word = _init_lexeme(string, hashed, split, length) + LEXEMES[hashed] = word + STRINGS[hashed] = string + return word + + +cdef Lexeme* _init_lexeme(unicode string, StringHash hashed, + int split, size_t length) except NULL: + assert split <= length + cdef Lexeme* word = calloc(1, sizeof(Lexeme)) + + word.first = (string[0] if string else 0) + word.sic = hashed + + cdef unicode tail_string + cdef unicode lex + if split != 0 and split < length: + lex = _substr(string, 0, split, length) + tail_string = _substr(string, split, length, length) + else: + lex = string + tail_string = '' + assert lex + cdef unicode normed = normalize_word_string(lex) + cdef unicode last3 = _substr(string, length - 3, length, length) + + assert normed + assert len(normed) + + word.lex = hash_string(lex, len(lex)) + word.normed = hash_string(normed, len(normed)) + word.last3 = hash_string(last3, len(last3)) + + STRINGS[word.lex] = lex + STRINGS[word.normed] = normed + STRINGS[word.last3] = last3 + + # These are loaded later + word.prob = 0 + word.cluster = 0 + word.oft_upper = False + word.oft_title = False + + # Now recurse, and deal with the tail + if tail_string: + word.tail = lookup(tail_string) + return word + + +cdef size_t _find_split(unicode word, size_t length): + cdef size_t i = 0 + if word[0].isalnum(): + while i < length and word[i].isalnum(): + i += 1 + else: + # Split off a punctuation character, or a sequence of the same punctuation character + while i < length and not word[i].isalnum() and (i == 0 or word[i-1] == word[i]): + i += 1 + return i diff --git a/spacy/lexeme.cpp b/spacy/lexeme.cpp new file mode 100644 index 000000000..72140c838 --- /dev/null +++ b/spacy/lexeme.cpp @@ -0,0 +1,2433 @@ +/* Generated by Cython 0.20.1 on Sat Jul 5 20:44:26 2014 */ + +#define PY_SSIZE_T_CLEAN +#ifndef CYTHON_USE_PYLONG_INTERNALS +#ifdef PYLONG_BITS_IN_DIGIT +#define CYTHON_USE_PYLONG_INTERNALS 0 +#else +#include "pyconfig.h" +#ifdef PYLONG_BITS_IN_DIGIT +#define CYTHON_USE_PYLONG_INTERNALS 1 +#else +#define CYTHON_USE_PYLONG_INTERNALS 0 +#endif +#endif +#endif +#include "Python.h" +#ifndef Py_PYTHON_H + #error Python headers needed to compile C extensions, please install development version of Python. +#elif PY_VERSION_HEX < 0x02040000 + #error Cython requires Python 2.4+. +#else +#define CYTHON_ABI "0_20_1" +#include /* For offsetof */ +#ifndef offsetof +#define offsetof(type, member) ( (size_t) & ((type*)0) -> member ) +#endif +#if !defined(WIN32) && !defined(MS_WINDOWS) + #ifndef __stdcall + #define __stdcall + #endif + #ifndef __cdecl + #define __cdecl + #endif + #ifndef __fastcall + #define __fastcall + #endif +#endif +#ifndef DL_IMPORT + #define DL_IMPORT(t) t +#endif +#ifndef DL_EXPORT + #define DL_EXPORT(t) t +#endif +#ifndef PY_LONG_LONG + #define PY_LONG_LONG LONG_LONG +#endif +#ifndef Py_HUGE_VAL + #define Py_HUGE_VAL HUGE_VAL +#endif +#ifdef PYPY_VERSION +#define CYTHON_COMPILING_IN_PYPY 1 +#define CYTHON_COMPILING_IN_CPYTHON 0 +#else +#define CYTHON_COMPILING_IN_PYPY 0 +#define CYTHON_COMPILING_IN_CPYTHON 1 +#endif +#if CYTHON_COMPILING_IN_PYPY +#define Py_OptimizeFlag 0 +#endif +#if PY_VERSION_HEX < 0x02050000 + typedef int Py_ssize_t; + #define PY_SSIZE_T_MAX INT_MAX + #define PY_SSIZE_T_MIN INT_MIN + #define PY_FORMAT_SIZE_T "" + #define CYTHON_FORMAT_SSIZE_T "" + #define PyInt_FromSsize_t(z) PyInt_FromLong(z) + #define PyInt_AsSsize_t(o) __Pyx_PyInt_As_int(o) + #define PyNumber_Index(o) ((PyNumber_Check(o) && !PyFloat_Check(o)) ? PyNumber_Int(o) : \ + (PyErr_Format(PyExc_TypeError, \ + "expected index value, got %.200s", Py_TYPE(o)->tp_name), \ + (PyObject*)0)) + #define __Pyx_PyIndex_Check(o) (PyNumber_Check(o) && !PyFloat_Check(o) && \ + !PyComplex_Check(o)) + #define PyIndex_Check __Pyx_PyIndex_Check + #define PyErr_WarnEx(category, message, stacklevel) PyErr_Warn(category, message) + #define __PYX_BUILD_PY_SSIZE_T "i" +#else + #define __PYX_BUILD_PY_SSIZE_T "n" + #define CYTHON_FORMAT_SSIZE_T "z" + #define __Pyx_PyIndex_Check PyIndex_Check +#endif +#if PY_VERSION_HEX < 0x02060000 + #define Py_REFCNT(ob) (((PyObject*)(ob))->ob_refcnt) + #define Py_TYPE(ob) (((PyObject*)(ob))->ob_type) + #define Py_SIZE(ob) (((PyVarObject*)(ob))->ob_size) + #define PyVarObject_HEAD_INIT(type, size) \ + PyObject_HEAD_INIT(type) size, + #define PyType_Modified(t) + typedef struct { + void *buf; + PyObject *obj; + Py_ssize_t len; + Py_ssize_t itemsize; + int readonly; + int ndim; + char *format; + Py_ssize_t *shape; + Py_ssize_t *strides; + Py_ssize_t *suboffsets; + void *internal; + } Py_buffer; + #define PyBUF_SIMPLE 0 + #define PyBUF_WRITABLE 0x0001 + #define PyBUF_FORMAT 0x0004 + #define PyBUF_ND 0x0008 + #define PyBUF_STRIDES (0x0010 | PyBUF_ND) + #define PyBUF_C_CONTIGUOUS (0x0020 | PyBUF_STRIDES) + #define PyBUF_F_CONTIGUOUS (0x0040 | PyBUF_STRIDES) + #define PyBUF_ANY_CONTIGUOUS (0x0080 | PyBUF_STRIDES) + #define PyBUF_INDIRECT (0x0100 | PyBUF_STRIDES) + #define PyBUF_RECORDS (PyBUF_STRIDES | PyBUF_FORMAT | PyBUF_WRITABLE) + #define PyBUF_FULL (PyBUF_INDIRECT | PyBUF_FORMAT | PyBUF_WRITABLE) + typedef int (*getbufferproc)(PyObject *, Py_buffer *, int); + typedef void (*releasebufferproc)(PyObject *, Py_buffer *); +#endif +#if PY_MAJOR_VERSION < 3 + #define __Pyx_BUILTIN_MODULE_NAME "__builtin__" + #define __Pyx_PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos) \ + PyCode_New(a+k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos) + #define __Pyx_DefaultClassType PyClass_Type +#else + #define __Pyx_BUILTIN_MODULE_NAME "builtins" + #define __Pyx_PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos) \ + PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos) + #define __Pyx_DefaultClassType PyType_Type +#endif +#if PY_VERSION_HEX < 0x02060000 + #define PyUnicode_FromString(s) PyUnicode_Decode(s, strlen(s), "UTF-8", "strict") +#endif +#if PY_MAJOR_VERSION >= 3 + #define Py_TPFLAGS_CHECKTYPES 0 + #define Py_TPFLAGS_HAVE_INDEX 0 +#endif +#if (PY_VERSION_HEX < 0x02060000) || (PY_MAJOR_VERSION >= 3) + #define Py_TPFLAGS_HAVE_NEWBUFFER 0 +#endif +#if PY_VERSION_HEX < 0x02060000 + #define Py_TPFLAGS_HAVE_VERSION_TAG 0 +#endif +#if PY_VERSION_HEX < 0x02060000 && !defined(Py_TPFLAGS_IS_ABSTRACT) + #define Py_TPFLAGS_IS_ABSTRACT 0 +#endif +#if PY_VERSION_HEX < 0x030400a1 && !defined(Py_TPFLAGS_HAVE_FINALIZE) + #define Py_TPFLAGS_HAVE_FINALIZE 0 +#endif +#if PY_VERSION_HEX > 0x03030000 && defined(PyUnicode_KIND) + #define CYTHON_PEP393_ENABLED 1 + #define __Pyx_PyUnicode_READY(op) (likely(PyUnicode_IS_READY(op)) ? \ + 0 : _PyUnicode_Ready((PyObject *)(op))) + #define __Pyx_PyUnicode_GET_LENGTH(u) PyUnicode_GET_LENGTH(u) + #define __Pyx_PyUnicode_READ_CHAR(u, i) PyUnicode_READ_CHAR(u, i) + #define __Pyx_PyUnicode_KIND(u) PyUnicode_KIND(u) + #define __Pyx_PyUnicode_DATA(u) PyUnicode_DATA(u) + #define __Pyx_PyUnicode_READ(k, d, i) PyUnicode_READ(k, d, i) +#else + #define CYTHON_PEP393_ENABLED 0 + #define __Pyx_PyUnicode_READY(op) (0) + #define __Pyx_PyUnicode_GET_LENGTH(u) PyUnicode_GET_SIZE(u) + #define __Pyx_PyUnicode_READ_CHAR(u, i) ((Py_UCS4)(PyUnicode_AS_UNICODE(u)[i])) + #define __Pyx_PyUnicode_KIND(u) (sizeof(Py_UNICODE)) + #define __Pyx_PyUnicode_DATA(u) ((void*)PyUnicode_AS_UNICODE(u)) + #define __Pyx_PyUnicode_READ(k, d, i) ((void)(k), (Py_UCS4)(((Py_UNICODE*)d)[i])) +#endif +#if CYTHON_COMPILING_IN_PYPY + #define __Pyx_PyUnicode_Concat(a, b) PyNumber_Add(a, b) + #define __Pyx_PyUnicode_ConcatSafe(a, b) PyNumber_Add(a, b) +#else + #define __Pyx_PyUnicode_Concat(a, b) PyUnicode_Concat(a, b) + #define __Pyx_PyUnicode_ConcatSafe(a, b) ((unlikely((a) == Py_None) || unlikely((b) == Py_None)) ? \ + PyNumber_Add(a, b) : __Pyx_PyUnicode_Concat(a, b)) +#endif +#define __Pyx_PyString_FormatSafe(a, b) ((unlikely((a) == Py_None)) ? PyNumber_Remainder(a, b) : __Pyx_PyString_Format(a, b)) +#define __Pyx_PyUnicode_FormatSafe(a, b) ((unlikely((a) == Py_None)) ? PyNumber_Remainder(a, b) : PyUnicode_Format(a, b)) +#if PY_MAJOR_VERSION >= 3 + #define __Pyx_PyString_Format(a, b) PyUnicode_Format(a, b) +#else + #define __Pyx_PyString_Format(a, b) PyString_Format(a, b) +#endif +#if PY_MAJOR_VERSION >= 3 + #define PyBaseString_Type PyUnicode_Type + #define PyStringObject PyUnicodeObject + #define PyString_Type PyUnicode_Type + #define PyString_Check PyUnicode_Check + #define PyString_CheckExact PyUnicode_CheckExact +#endif +#if PY_VERSION_HEX < 0x02060000 + #define PyBytesObject PyStringObject + #define PyBytes_Type PyString_Type + #define PyBytes_Check PyString_Check + #define PyBytes_CheckExact PyString_CheckExact + #define PyBytes_FromString PyString_FromString + #define PyBytes_FromStringAndSize PyString_FromStringAndSize + #define PyBytes_FromFormat PyString_FromFormat + #define PyBytes_DecodeEscape PyString_DecodeEscape + #define PyBytes_AsString PyString_AsString + #define PyBytes_AsStringAndSize PyString_AsStringAndSize + #define PyBytes_Size PyString_Size + #define PyBytes_AS_STRING PyString_AS_STRING + #define PyBytes_GET_SIZE PyString_GET_SIZE + #define PyBytes_Repr PyString_Repr + #define PyBytes_Concat PyString_Concat + #define PyBytes_ConcatAndDel PyString_ConcatAndDel +#endif +#if PY_MAJOR_VERSION >= 3 + #define __Pyx_PyBaseString_Check(obj) PyUnicode_Check(obj) + #define __Pyx_PyBaseString_CheckExact(obj) PyUnicode_CheckExact(obj) +#else + #define __Pyx_PyBaseString_Check(obj) (PyString_CheckExact(obj) || PyUnicode_CheckExact(obj) || \ + PyString_Check(obj) || PyUnicode_Check(obj)) + #define __Pyx_PyBaseString_CheckExact(obj) (PyString_CheckExact(obj) || PyUnicode_CheckExact(obj)) +#endif +#if PY_VERSION_HEX < 0x02060000 + #define PySet_Check(obj) PyObject_TypeCheck(obj, &PySet_Type) + #define PyFrozenSet_Check(obj) PyObject_TypeCheck(obj, &PyFrozenSet_Type) +#endif +#ifndef PySet_CheckExact + #define PySet_CheckExact(obj) (Py_TYPE(obj) == &PySet_Type) +#endif +#define __Pyx_TypeCheck(obj, type) PyObject_TypeCheck(obj, (PyTypeObject *)type) +#if PY_MAJOR_VERSION >= 3 + #define PyIntObject PyLongObject + #define PyInt_Type PyLong_Type + #define PyInt_Check(op) PyLong_Check(op) + #define PyInt_CheckExact(op) PyLong_CheckExact(op) + #define PyInt_FromString PyLong_FromString + #define PyInt_FromUnicode PyLong_FromUnicode + #define PyInt_FromLong PyLong_FromLong + #define PyInt_FromSize_t PyLong_FromSize_t + #define PyInt_FromSsize_t PyLong_FromSsize_t + #define PyInt_AsLong PyLong_AsLong + #define PyInt_AS_LONG PyLong_AS_LONG + #define PyInt_AsSsize_t PyLong_AsSsize_t + #define PyInt_AsUnsignedLongMask PyLong_AsUnsignedLongMask + #define PyInt_AsUnsignedLongLongMask PyLong_AsUnsignedLongLongMask + #define PyNumber_Int PyNumber_Long +#endif +#if PY_MAJOR_VERSION >= 3 + #define PyBoolObject PyLongObject +#endif +#if PY_VERSION_HEX < 0x030200A4 + typedef long Py_hash_t; + #define __Pyx_PyInt_FromHash_t PyInt_FromLong + #define __Pyx_PyInt_AsHash_t PyInt_AsLong +#else + #define __Pyx_PyInt_FromHash_t PyInt_FromSsize_t + #define __Pyx_PyInt_AsHash_t PyInt_AsSsize_t +#endif +#if (PY_MAJOR_VERSION < 3) || (PY_VERSION_HEX >= 0x03010300) + #define __Pyx_PySequence_GetSlice(obj, a, b) PySequence_GetSlice(obj, a, b) + #define __Pyx_PySequence_SetSlice(obj, a, b, value) PySequence_SetSlice(obj, a, b, value) + #define __Pyx_PySequence_DelSlice(obj, a, b) PySequence_DelSlice(obj, a, b) +#else + #define __Pyx_PySequence_GetSlice(obj, a, b) (unlikely(!(obj)) ? \ + (PyErr_SetString(PyExc_SystemError, "null argument to internal routine"), (PyObject*)0) : \ + (likely((obj)->ob_type->tp_as_mapping) ? (PySequence_GetSlice(obj, a, b)) : \ + (PyErr_Format(PyExc_TypeError, "'%.200s' object is unsliceable", (obj)->ob_type->tp_name), (PyObject*)0))) + #define __Pyx_PySequence_SetSlice(obj, a, b, value) (unlikely(!(obj)) ? \ + (PyErr_SetString(PyExc_SystemError, "null argument to internal routine"), -1) : \ + (likely((obj)->ob_type->tp_as_mapping) ? (PySequence_SetSlice(obj, a, b, value)) : \ + (PyErr_Format(PyExc_TypeError, "'%.200s' object doesn't support slice assignment", (obj)->ob_type->tp_name), -1))) + #define __Pyx_PySequence_DelSlice(obj, a, b) (unlikely(!(obj)) ? \ + (PyErr_SetString(PyExc_SystemError, "null argument to internal routine"), -1) : \ + (likely((obj)->ob_type->tp_as_mapping) ? (PySequence_DelSlice(obj, a, b)) : \ + (PyErr_Format(PyExc_TypeError, "'%.200s' object doesn't support slice deletion", (obj)->ob_type->tp_name), -1))) +#endif +#if PY_MAJOR_VERSION >= 3 + #define PyMethod_New(func, self, klass) ((self) ? PyMethod_New(func, self) : PyInstanceMethod_New(func)) +#endif +#if PY_VERSION_HEX < 0x02050000 + #define __Pyx_GetAttrString(o,n) PyObject_GetAttrString((o),((char *)(n))) + #define __Pyx_SetAttrString(o,n,a) PyObject_SetAttrString((o),((char *)(n)),(a)) + #define __Pyx_DelAttrString(o,n) PyObject_DelAttrString((o),((char *)(n))) +#else + #define __Pyx_GetAttrString(o,n) PyObject_GetAttrString((o),(n)) + #define __Pyx_SetAttrString(o,n,a) PyObject_SetAttrString((o),(n),(a)) + #define __Pyx_DelAttrString(o,n) PyObject_DelAttrString((o),(n)) +#endif +#if PY_VERSION_HEX < 0x02050000 + #define __Pyx_NAMESTR(n) ((char *)(n)) + #define __Pyx_DOCSTR(n) ((char *)(n)) +#else + #define __Pyx_NAMESTR(n) (n) + #define __Pyx_DOCSTR(n) (n) +#endif +#ifndef CYTHON_INLINE + #if defined(__GNUC__) + #define CYTHON_INLINE __inline__ + #elif defined(_MSC_VER) + #define CYTHON_INLINE __inline + #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define CYTHON_INLINE inline + #else + #define CYTHON_INLINE + #endif +#endif +#ifndef CYTHON_RESTRICT + #if defined(__GNUC__) + #define CYTHON_RESTRICT __restrict__ + #elif defined(_MSC_VER) && _MSC_VER >= 1400 + #define CYTHON_RESTRICT __restrict + #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define CYTHON_RESTRICT restrict + #else + #define CYTHON_RESTRICT + #endif +#endif +#ifdef NAN +#define __PYX_NAN() ((float) NAN) +#else +static CYTHON_INLINE float __PYX_NAN() { + /* Initialize NaN. The sign is irrelevant, an exponent with all bits 1 and + a nonzero mantissa means NaN. If the first bit in the mantissa is 1, it is + a quiet NaN. */ + float value; + memset(&value, 0xFF, sizeof(value)); + return value; +} +#endif + + +#if PY_MAJOR_VERSION >= 3 + #define __Pyx_PyNumber_Divide(x,y) PyNumber_TrueDivide(x,y) + #define __Pyx_PyNumber_InPlaceDivide(x,y) PyNumber_InPlaceTrueDivide(x,y) +#else + #define __Pyx_PyNumber_Divide(x,y) PyNumber_Divide(x,y) + #define __Pyx_PyNumber_InPlaceDivide(x,y) PyNumber_InPlaceDivide(x,y) +#endif + +#ifndef __PYX_EXTERN_C + #ifdef __cplusplus + #define __PYX_EXTERN_C extern "C" + #else + #define __PYX_EXTERN_C extern + #endif +#endif + +#if defined(WIN32) || defined(MS_WINDOWS) +#define _USE_MATH_DEFINES +#endif +#include +#define __PYX_HAVE__spacy__lexeme +#define __PYX_HAVE_API__spacy__lexeme +#include "stdint.h" +#ifdef _OPENMP +#include +#endif /* _OPENMP */ + +#ifdef PYREX_WITHOUT_ASSERTIONS +#define CYTHON_WITHOUT_ASSERTIONS +#endif + +#ifndef CYTHON_UNUSED +# if defined(__GNUC__) +# if !(defined(__cplusplus)) || (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) +# define CYTHON_UNUSED __attribute__ ((__unused__)) +# else +# define CYTHON_UNUSED +# endif +# elif defined(__ICC) || (defined(__INTEL_COMPILER) && !defined(_MSC_VER)) +# define CYTHON_UNUSED __attribute__ ((__unused__)) +# else +# define CYTHON_UNUSED +# endif +#endif +typedef struct {PyObject **p; char *s; const Py_ssize_t n; const char* encoding; + const char is_unicode; const char is_str; const char intern; } __Pyx_StringTabEntry; /*proto*/ + +#define __PYX_DEFAULT_STRING_ENCODING_IS_ASCII 0 +#define __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT 0 +#define __PYX_DEFAULT_STRING_ENCODING "" +#define __Pyx_PyObject_FromString __Pyx_PyBytes_FromString +#define __Pyx_PyObject_FromStringAndSize __Pyx_PyBytes_FromStringAndSize +#define __Pyx_fits_Py_ssize_t(v, type, is_signed) ( \ + (sizeof(type) < sizeof(Py_ssize_t)) || \ + (sizeof(type) > sizeof(Py_ssize_t) && \ + likely(v < (type)PY_SSIZE_T_MAX || \ + v == (type)PY_SSIZE_T_MAX) && \ + (!is_signed || likely(v > (type)PY_SSIZE_T_MIN || \ + v == (type)PY_SSIZE_T_MIN))) || \ + (sizeof(type) == sizeof(Py_ssize_t) && \ + (is_signed || likely(v < (type)PY_SSIZE_T_MAX || \ + v == (type)PY_SSIZE_T_MAX))) ) +static CYTHON_INLINE char* __Pyx_PyObject_AsString(PyObject*); +static CYTHON_INLINE char* __Pyx_PyObject_AsStringAndSize(PyObject*, Py_ssize_t* length); +#define __Pyx_PyByteArray_FromString(s) PyByteArray_FromStringAndSize((const char*)s, strlen((const char*)s)) +#define __Pyx_PyByteArray_FromStringAndSize(s, l) PyByteArray_FromStringAndSize((const char*)s, l) +#define __Pyx_PyBytes_FromString PyBytes_FromString +#define __Pyx_PyBytes_FromStringAndSize PyBytes_FromStringAndSize +static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(char*); +#if PY_MAJOR_VERSION < 3 + #define __Pyx_PyStr_FromString __Pyx_PyBytes_FromString + #define __Pyx_PyStr_FromStringAndSize __Pyx_PyBytes_FromStringAndSize +#else + #define __Pyx_PyStr_FromString __Pyx_PyUnicode_FromString + #define __Pyx_PyStr_FromStringAndSize __Pyx_PyUnicode_FromStringAndSize +#endif +#define __Pyx_PyObject_AsSString(s) ((signed char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_AsUString(s) ((unsigned char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_FromUString(s) __Pyx_PyObject_FromString((char*)s) +#define __Pyx_PyBytes_FromUString(s) __Pyx_PyBytes_FromString((char*)s) +#define __Pyx_PyByteArray_FromUString(s) __Pyx_PyByteArray_FromString((char*)s) +#define __Pyx_PyStr_FromUString(s) __Pyx_PyStr_FromString((char*)s) +#define __Pyx_PyUnicode_FromUString(s) __Pyx_PyUnicode_FromString((char*)s) +#if PY_MAJOR_VERSION < 3 +static CYTHON_INLINE size_t __Pyx_Py_UNICODE_strlen(const Py_UNICODE *u) +{ + const Py_UNICODE *u_end = u; + while (*u_end++) ; + return u_end - u - 1; +} +#else +#define __Pyx_Py_UNICODE_strlen Py_UNICODE_strlen +#endif +#define __Pyx_PyUnicode_FromUnicode(u) PyUnicode_FromUnicode(u, __Pyx_Py_UNICODE_strlen(u)) +#define __Pyx_PyUnicode_FromUnicodeAndLength PyUnicode_FromUnicode +#define __Pyx_PyUnicode_AsUnicode PyUnicode_AsUnicode +#define __Pyx_Owned_Py_None(b) (Py_INCREF(Py_None), Py_None) +#define __Pyx_PyBool_FromLong(b) ((b) ? (Py_INCREF(Py_True), Py_True) : (Py_INCREF(Py_False), Py_False)) +static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject*); +static CYTHON_INLINE PyObject* __Pyx_PyNumber_Int(PyObject* x); +static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject*); +static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t); +#if CYTHON_COMPILING_IN_CPYTHON +#define __pyx_PyFloat_AsDouble(x) (PyFloat_CheckExact(x) ? PyFloat_AS_DOUBLE(x) : PyFloat_AsDouble(x)) +#else +#define __pyx_PyFloat_AsDouble(x) PyFloat_AsDouble(x) +#endif +#define __pyx_PyFloat_AsFloat(x) ((float) __pyx_PyFloat_AsDouble(x)) +#if PY_MAJOR_VERSION < 3 && __PYX_DEFAULT_STRING_ENCODING_IS_ASCII +static int __Pyx_sys_getdefaultencoding_not_ascii; +static int __Pyx_init_sys_getdefaultencoding_params(void) { + PyObject* sys = NULL; + PyObject* default_encoding = NULL; + PyObject* ascii_chars_u = NULL; + PyObject* ascii_chars_b = NULL; + sys = PyImport_ImportModule("sys"); + if (sys == NULL) goto bad; + default_encoding = PyObject_CallMethod(sys, (char*) (const char*) "getdefaultencoding", NULL); + if (default_encoding == NULL) goto bad; + if (strcmp(PyBytes_AsString(default_encoding), "ascii") == 0) { + __Pyx_sys_getdefaultencoding_not_ascii = 0; + } else { + const char* default_encoding_c = PyBytes_AS_STRING(default_encoding); + char ascii_chars[128]; + int c; + for (c = 0; c < 128; c++) { + ascii_chars[c] = c; + } + __Pyx_sys_getdefaultencoding_not_ascii = 1; + ascii_chars_u = PyUnicode_DecodeASCII(ascii_chars, 128, NULL); + if (ascii_chars_u == NULL) goto bad; + ascii_chars_b = PyUnicode_AsEncodedString(ascii_chars_u, default_encoding_c, NULL); + if (ascii_chars_b == NULL || strncmp(ascii_chars, PyBytes_AS_STRING(ascii_chars_b), 128) != 0) { + PyErr_Format( + PyExc_ValueError, + "This module compiled with c_string_encoding=ascii, but default encoding '%.200s' is not a superset of ascii.", + default_encoding_c); + goto bad; + } + } + Py_XDECREF(sys); + Py_XDECREF(default_encoding); + Py_XDECREF(ascii_chars_u); + Py_XDECREF(ascii_chars_b); + return 0; +bad: + Py_XDECREF(sys); + Py_XDECREF(default_encoding); + Py_XDECREF(ascii_chars_u); + Py_XDECREF(ascii_chars_b); + return -1; +} +#endif +#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT && PY_MAJOR_VERSION >= 3 +#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_DecodeUTF8(c_str, size, NULL) +#else +#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_Decode(c_str, size, __PYX_DEFAULT_STRING_ENCODING, NULL) +#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT +static char* __PYX_DEFAULT_STRING_ENCODING; +static int __Pyx_init_sys_getdefaultencoding_params(void) { + PyObject* sys = NULL; + PyObject* default_encoding = NULL; + char* default_encoding_c; + sys = PyImport_ImportModule("sys"); + if (sys == NULL) goto bad; + default_encoding = PyObject_CallMethod(sys, (char*) (const char*) "getdefaultencoding", NULL); + if (default_encoding == NULL) goto bad; + default_encoding_c = PyBytes_AS_STRING(default_encoding); + __PYX_DEFAULT_STRING_ENCODING = (char*) malloc(strlen(default_encoding_c)); + strcpy(__PYX_DEFAULT_STRING_ENCODING, default_encoding_c); + Py_DECREF(sys); + Py_DECREF(default_encoding); + return 0; +bad: + Py_XDECREF(sys); + Py_XDECREF(default_encoding); + return -1; +} +#endif +#endif + + +#ifdef __GNUC__ + /* Test for GCC > 2.95 */ + #if __GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95)) + #define likely(x) __builtin_expect(!!(x), 1) + #define unlikely(x) __builtin_expect(!!(x), 0) + #else /* __GNUC__ > 2 ... */ + #define likely(x) (x) + #define unlikely(x) (x) + #endif /* __GNUC__ > 2 ... */ +#else /* __GNUC__ */ + #define likely(x) (x) + #define unlikely(x) (x) +#endif /* __GNUC__ */ + +static PyObject *__pyx_m; +static PyObject *__pyx_d; +static PyObject *__pyx_b; +static PyObject *__pyx_empty_tuple; +static PyObject *__pyx_empty_bytes; +static int __pyx_lineno; +static int __pyx_clineno = 0; +static const char * __pyx_cfilenm= __FILE__; +static const char *__pyx_filename; + + +static const char *__pyx_f[] = { + "lexeme.pyx", +}; + +/* "spacy/lexeme.pxd":4 + * + * + * ctypedef int ClusterID # <<<<<<<<<<<<<< + * ctypedef uint64_t StringHash + * + */ +typedef int __pyx_t_5spacy_6lexeme_ClusterID; + +/* "spacy/lexeme.pxd":5 + * + * ctypedef int ClusterID + * ctypedef uint64_t StringHash # <<<<<<<<<<<<<< + * + * + */ +typedef uint64_t __pyx_t_5spacy_6lexeme_StringHash; + +/*--- Type declarations ---*/ +struct __pyx_t_5spacy_6lexeme_Lexeme; + +/* "spacy/lexeme.pxd":27 + * # over the Lexeme, via: + * # for field in range(LexAttr.n): get_attr(Lexeme*, field) + * cdef enum HashFields: # <<<<<<<<<<<<<< + * sic + * lex + */ +enum __pyx_t_5spacy_6lexeme_HashFields { + __pyx_e_5spacy_6lexeme_sic, + __pyx_e_5spacy_6lexeme_lex, + __pyx_e_5spacy_6lexeme_normed, + __pyx_e_5spacy_6lexeme_cluster, + __pyx_e_5spacy_6lexeme_n +}; + +/* "spacy/lexeme.pxd":8 + * + * + * cdef struct Lexeme: # <<<<<<<<<<<<<< + * StringHash sic # Hash of the original string + * StringHash lex # Hash of the word, with punctuation and clitics split off + */ +struct __pyx_t_5spacy_6lexeme_Lexeme { + __pyx_t_5spacy_6lexeme_StringHash sic; + __pyx_t_5spacy_6lexeme_StringHash lex; + __pyx_t_5spacy_6lexeme_StringHash normed; + __pyx_t_5spacy_6lexeme_StringHash last3; + Py_UNICODE first; + double prob; + __pyx_t_5spacy_6lexeme_ClusterID cluster; + int oft_upper; + int oft_title; + struct __pyx_t_5spacy_6lexeme_Lexeme *tail; +}; +#ifndef CYTHON_REFNANNY + #define CYTHON_REFNANNY 0 +#endif +#if CYTHON_REFNANNY + typedef struct { + void (*INCREF)(void*, PyObject*, int); + void (*DECREF)(void*, PyObject*, int); + void (*GOTREF)(void*, PyObject*, int); + void (*GIVEREF)(void*, PyObject*, int); + void* (*SetupContext)(const char*, int, const char*); + void (*FinishContext)(void**); + } __Pyx_RefNannyAPIStruct; + static __Pyx_RefNannyAPIStruct *__Pyx_RefNanny = NULL; + static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname); /*proto*/ + #define __Pyx_RefNannyDeclarations void *__pyx_refnanny = NULL; +#ifdef WITH_THREAD + #define __Pyx_RefNannySetupContext(name, acquire_gil) \ + if (acquire_gil) { \ + PyGILState_STATE __pyx_gilstate_save = PyGILState_Ensure(); \ + __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__); \ + PyGILState_Release(__pyx_gilstate_save); \ + } else { \ + __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__); \ + } +#else + #define __Pyx_RefNannySetupContext(name, acquire_gil) \ + __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__) +#endif + #define __Pyx_RefNannyFinishContext() \ + __Pyx_RefNanny->FinishContext(&__pyx_refnanny) + #define __Pyx_INCREF(r) __Pyx_RefNanny->INCREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_DECREF(r) __Pyx_RefNanny->DECREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_GOTREF(r) __Pyx_RefNanny->GOTREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_GIVEREF(r) __Pyx_RefNanny->GIVEREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_XINCREF(r) do { if((r) != NULL) {__Pyx_INCREF(r); }} while(0) + #define __Pyx_XDECREF(r) do { if((r) != NULL) {__Pyx_DECREF(r); }} while(0) + #define __Pyx_XGOTREF(r) do { if((r) != NULL) {__Pyx_GOTREF(r); }} while(0) + #define __Pyx_XGIVEREF(r) do { if((r) != NULL) {__Pyx_GIVEREF(r);}} while(0) +#else + #define __Pyx_RefNannyDeclarations + #define __Pyx_RefNannySetupContext(name, acquire_gil) + #define __Pyx_RefNannyFinishContext() + #define __Pyx_INCREF(r) Py_INCREF(r) + #define __Pyx_DECREF(r) Py_DECREF(r) + #define __Pyx_GOTREF(r) + #define __Pyx_GIVEREF(r) + #define __Pyx_XINCREF(r) Py_XINCREF(r) + #define __Pyx_XDECREF(r) Py_XDECREF(r) + #define __Pyx_XGOTREF(r) + #define __Pyx_XGIVEREF(r) +#endif /* CYTHON_REFNANNY */ +#define __Pyx_XDECREF_SET(r, v) do { \ + PyObject *tmp = (PyObject *) r; \ + r = v; __Pyx_XDECREF(tmp); \ + } while (0) +#define __Pyx_DECREF_SET(r, v) do { \ + PyObject *tmp = (PyObject *) r; \ + r = v; __Pyx_DECREF(tmp); \ + } while (0) +#define __Pyx_CLEAR(r) do { PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);} while(0) +#define __Pyx_XCLEAR(r) do { if((r) != NULL) {PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);}} while(0) + +static CYTHON_INLINE size_t __Pyx_PyInt_As_size_t(PyObject *); + +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_uint64_t(uint64_t value); + +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_int(int value); + +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value); + +static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *); + +static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *); + +static int __Pyx_check_binary_version(void); + +typedef struct { + int code_line; + PyCodeObject* code_object; +} __Pyx_CodeObjectCacheEntry; +struct __Pyx_CodeObjectCache { + int count; + int max_count; + __Pyx_CodeObjectCacheEntry* entries; +}; +static struct __Pyx_CodeObjectCache __pyx_code_cache = {0,0,NULL}; +static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line); +static PyCodeObject *__pyx_find_code_object(int code_line); +static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object); + +static void __Pyx_AddTraceback(const char *funcname, int c_line, + int py_line, const char *filename); /*proto*/ + +static int __Pyx_InitStrings(__Pyx_StringTabEntry *t); /*proto*/ + + +/* Module declarations from 'libc.stdint' */ + +/* Module declarations from 'spacy.lexeme' */ +static __pyx_t_5spacy_6lexeme_StringHash __pyx_f_5spacy_6lexeme_sic_of(size_t, int __pyx_skip_dispatch); /*proto*/ +static __pyx_t_5spacy_6lexeme_StringHash __pyx_f_5spacy_6lexeme_lex_of(size_t, int __pyx_skip_dispatch); /*proto*/ +static __pyx_t_5spacy_6lexeme_ClusterID __pyx_f_5spacy_6lexeme_cluster_of(size_t, int __pyx_skip_dispatch); /*proto*/ +static Py_UNICODE __pyx_f_5spacy_6lexeme_first_of(size_t, int __pyx_skip_dispatch); /*proto*/ +static double __pyx_f_5spacy_6lexeme_prob_of(size_t, int __pyx_skip_dispatch); /*proto*/ +static __pyx_t_5spacy_6lexeme_StringHash __pyx_f_5spacy_6lexeme_last3_of(size_t, int __pyx_skip_dispatch); /*proto*/ +static int __pyx_f_5spacy_6lexeme_is_oft_upper(size_t, int __pyx_skip_dispatch); /*proto*/ +static int __pyx_f_5spacy_6lexeme_is_oft_title(size_t, int __pyx_skip_dispatch); /*proto*/ +#define __Pyx_MODULE_NAME "spacy.lexeme" +int __pyx_module_is_main_spacy__lexeme = 0; + +/* Implementation of 'spacy.lexeme' */ +static PyObject *__pyx_pf_5spacy_6lexeme_sic_of(CYTHON_UNUSED PyObject *__pyx_self, size_t __pyx_v_lex_id); /* proto */ +static PyObject *__pyx_pf_5spacy_6lexeme_2lex_of(CYTHON_UNUSED PyObject *__pyx_self, size_t __pyx_v_lex_id); /* proto */ +static PyObject *__pyx_pf_5spacy_6lexeme_4cluster_of(CYTHON_UNUSED PyObject *__pyx_self, size_t __pyx_v_lex_id); /* proto */ +static PyObject *__pyx_pf_5spacy_6lexeme_6first_of(CYTHON_UNUSED PyObject *__pyx_self, size_t __pyx_v_lex_id); /* proto */ +static PyObject *__pyx_pf_5spacy_6lexeme_8prob_of(CYTHON_UNUSED PyObject *__pyx_self, size_t __pyx_v_lex_id); /* proto */ +static PyObject *__pyx_pf_5spacy_6lexeme_10last3_of(CYTHON_UNUSED PyObject *__pyx_self, size_t __pyx_v_lex_id); /* proto */ +static PyObject *__pyx_pf_5spacy_6lexeme_12is_oft_upper(CYTHON_UNUSED PyObject *__pyx_self, size_t __pyx_v_lex_id); /* proto */ +static PyObject *__pyx_pf_5spacy_6lexeme_14is_oft_title(CYTHON_UNUSED PyObject *__pyx_self, size_t __pyx_v_lex_id); /* proto */ +static char __pyx_k_main[] = "__main__"; +static char __pyx_k_test[] = "__test__"; +static char __pyx_k_sic_of_line_7[] = "sic_of (line 7)"; +static char __pyx_k_lex_of_line_19[] = "lex_of (line 19)"; +static char __pyx_k_prob_of_line_63[] = "prob_of (line 63)"; +static char __pyx_k_first_of_line_52[] = "first_of (line 52)"; +static char __pyx_k_last3_of_line_76[] = "last3_of (line 76)"; +static char __pyx_k_cluster_of_line_33[] = "cluster_of (line 33)"; +static char __pyx_k_is_oft_upper_line_87[] = "is_oft_upper (line 87)"; +static char __pyx_k_is_oft_title_line_102[] = "is_oft_title (line 102)"; +static char __pyx_k_Access_the_cluster_field_of_the[] = "Access the `cluster' field of the Lexeme pointed to by lex_id, which\n gives an integer representation of the cluster ID of the word, \n which should be understood as a binary address:\n\n >>> strings = (u'pineapple', u'apple', u'dapple', u'scalable')\n >>> token_ids = [lookup(s) for s in strings]\n >>> clusters = [cluster_of(t) for t in token_ids]\n >>> print [\"{0:b\"} % cluster_of(t) for t in token_ids]\n [\"100111110110\", \"100111100100\", \"01010111011001\", \"100111110110\"]\n\n The clusterings are unideal, but often slightly useful.\n \"pineapple\" and \"apple\" share a long prefix, indicating a similar meaning,\n while \"dapple\" is totally different. On the other hand, \"scalable\" receives\n the same cluster ID as \"pineapple\", which is not what we'd like.\n "; +static char __pyx_k_Accessors_for_Lexeme_properties[] = "Accessors for Lexeme properties, given a lex_id, which is cast to a Lexeme*.\nMostly useful from Python-space. From Cython-space, you can just cast to\nLexeme* yourself.\n"; +static char __pyx_k_Access_the_first_field_of_the_Le[] = "Access the `first' field of the Lexeme pointed to by lex_id, which\n stores the first character of the lex string of the word.\n\n >>> lex_id = lookup(u'Hello')\n >>> unhash(first_of(lex_id))\n u'H'\n "; +static char __pyx_k_Access_the_last3_field_of_the_Le[] = "Access the `last3' field of the Lexeme pointed to by lex_id, which stores\n the hash of the last three characters of the word:\n\n >>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]\n >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]\n [u'llo', u'!']\n "; +static char __pyx_k_Access_the_lex_field_of_the_Lexe[] = "Access the `lex' field of the Lexeme pointed to by lex_id.\n\n The lex field is the hash of the string you would expect to get back from\n a standard tokenizer, i.e. the word with punctuation and other non-whitespace\n delimited tokens split off. The other fields refer to properties of the\n string that the lex field stores a hash of, except sic and tail.\n\n >>> [unhash(lex_of(lex_id) for lex_id in from_string(u'Hi! world')]\n [u'Hi', u'!', u'world']\n "; +static char __pyx_k_Access_the_oft_upper_field_of_th[] = "Access the `oft_upper' field of the Lexeme pointed to by lex_id, which\n stores whether the lowered version of the string hashed by `lex' is found\n in all-upper case frequently in a large sample of text. Users are free\n to load different data, by default we use a sample from Wikipedia, with\n a threshold of 0.95, picked to maximize mutual information for POS tagging.\n\n >>> is_oft_upper(lookup(u'abc'))\n True\n >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer\n True\n "; +static char __pyx_k_Access_the_prob_field_of_the_Lex[] = "Access the `prob' field of the Lexeme pointed to by lex_id, which stores\n the smoothed unigram log probability of the word, as estimated from a large\n text corpus. By default, probabilities are based on counts from Gigaword,\n smoothed using Knesser-Ney; but any probabilities file can be supplied to\n load_probs.\n \n >>> prob_of(lookup(u'world'))\n -20.10340371976182\n "; +static char __pyx_k_Access_the_sic_field_of_the_Lexe[] = "Access the `sic' field of the Lexeme pointed to by lex_id.\n \n The sic field stores the hash of the whitespace-delimited string-chunk used to\n construct the Lexeme.\n \n >>> [unhash(sic_of(lex_id)) for lex_id in from_string(u'Hi! world')]\n [u'Hi!', u'', u'world]\n "; +static char __pyx_k_Access_the_oft_upper_field_of_th_2[] = "Access the `oft_upper' field of the Lexeme pointed to by lex_id, which\n stores whether the lowered version of the string hashed by `lex' is found\n title-cased frequently in a large sample of text. Users are free\n to load different data, by default we use a sample from Wikipedia, with\n a threshold of 0.3, picked to maximize mutual information for POS tagging.\n\n >>> is_oft_title(lookup(u'marcus'))\n True\n >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value\n True\n "; +static PyObject *__pyx_kp_u_Access_the_cluster_field_of_the; +static PyObject *__pyx_kp_u_Access_the_first_field_of_the_Le; +static PyObject *__pyx_kp_u_Access_the_last3_field_of_the_Le; +static PyObject *__pyx_kp_u_Access_the_lex_field_of_the_Lexe; +static PyObject *__pyx_kp_u_Access_the_oft_upper_field_of_th; +static PyObject *__pyx_kp_u_Access_the_oft_upper_field_of_th_2; +static PyObject *__pyx_kp_u_Access_the_prob_field_of_the_Lex; +static PyObject *__pyx_kp_u_Access_the_sic_field_of_the_Lexe; +static PyObject *__pyx_kp_u_cluster_of_line_33; +static PyObject *__pyx_kp_u_first_of_line_52; +static PyObject *__pyx_kp_u_is_oft_title_line_102; +static PyObject *__pyx_kp_u_is_oft_upper_line_87; +static PyObject *__pyx_kp_u_last3_of_line_76; +static PyObject *__pyx_kp_u_lex_of_line_19; +static PyObject *__pyx_n_s_main; +static PyObject *__pyx_kp_u_prob_of_line_63; +static PyObject *__pyx_kp_u_sic_of_line_7; +static PyObject *__pyx_n_s_test; + +/* "spacy/lexeme.pyx":7 + * + * + * cpdef StringHash sic_of(size_t lex_id) except 0: # <<<<<<<<<<<<<< + * '''Access the `sic' field of the Lexeme pointed to by lex_id. + * + */ + +static PyObject *__pyx_pw_5spacy_6lexeme_1sic_of(PyObject *__pyx_self, PyObject *__pyx_arg_lex_id); /*proto*/ +static __pyx_t_5spacy_6lexeme_StringHash __pyx_f_5spacy_6lexeme_sic_of(size_t __pyx_v_lex_id, CYTHON_UNUSED int __pyx_skip_dispatch) { + __pyx_t_5spacy_6lexeme_StringHash __pyx_r; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("sic_of", 0); + + /* "spacy/lexeme.pyx":16 + * [u'Hi!', u'', u'world] + * ''' + * return (lex_id).sic # <<<<<<<<<<<<<< + * + * + */ + __pyx_r = ((struct __pyx_t_5spacy_6lexeme_Lexeme *)__pyx_v_lex_id)->sic; + goto __pyx_L0; + + /* "spacy/lexeme.pyx":7 + * + * + * cpdef StringHash sic_of(size_t lex_id) except 0: # <<<<<<<<<<<<<< + * '''Access the `sic' field of the Lexeme pointed to by lex_id. + * + */ + + /* function exit code */ + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* Python wrapper */ +static PyObject *__pyx_pw_5spacy_6lexeme_1sic_of(PyObject *__pyx_self, PyObject *__pyx_arg_lex_id); /*proto*/ +static char __pyx_doc_5spacy_6lexeme_sic_of[] = "Access the `sic' field of the Lexeme pointed to by lex_id.\n \n The sic field stores the hash of the whitespace-delimited string-chunk used to\n construct the Lexeme.\n \n >>> [unhash(sic_of(lex_id)) for lex_id in from_string(u'Hi! world')]\n [u'Hi!', u'', u'world]\n "; +static PyObject *__pyx_pw_5spacy_6lexeme_1sic_of(PyObject *__pyx_self, PyObject *__pyx_arg_lex_id) { + size_t __pyx_v_lex_id; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("sic_of (wrapper)", 0); + assert(__pyx_arg_lex_id); { + __pyx_v_lex_id = __Pyx_PyInt_As_size_t(__pyx_arg_lex_id); if (unlikely((__pyx_v_lex_id == (size_t)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 7; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L3_error:; + __Pyx_AddTraceback("spacy.lexeme.sic_of", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + __pyx_r = __pyx_pf_5spacy_6lexeme_sic_of(__pyx_self, ((size_t)__pyx_v_lex_id)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5spacy_6lexeme_sic_of(CYTHON_UNUSED PyObject *__pyx_self, size_t __pyx_v_lex_id) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + __pyx_t_5spacy_6lexeme_StringHash __pyx_t_1; + PyObject *__pyx_t_2 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("sic_of", 0); + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = __pyx_f_5spacy_6lexeme_sic_of(__pyx_v_lex_id, 0); if (unlikely(__pyx_t_1 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 7; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyInt_From_uint64_t(__pyx_t_1); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 7; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_2); + __pyx_r = __pyx_t_2; + __pyx_t_2 = 0; + goto __pyx_L0; + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_2); + __Pyx_AddTraceback("spacy.lexeme.sic_of", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "spacy/lexeme.pyx":19 + * + * + * cpdef StringHash lex_of(size_t lex_id) except 0: # <<<<<<<<<<<<<< + * '''Access the `lex' field of the Lexeme pointed to by lex_id. + * + */ + +static PyObject *__pyx_pw_5spacy_6lexeme_3lex_of(PyObject *__pyx_self, PyObject *__pyx_arg_lex_id); /*proto*/ +static __pyx_t_5spacy_6lexeme_StringHash __pyx_f_5spacy_6lexeme_lex_of(size_t __pyx_v_lex_id, CYTHON_UNUSED int __pyx_skip_dispatch) { + __pyx_t_5spacy_6lexeme_StringHash __pyx_r; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("lex_of", 0); + + /* "spacy/lexeme.pyx":30 + * [u'Hi', u'!', u'world'] + * ''' + * return (lex_id).lex # <<<<<<<<<<<<<< + * + * + */ + __pyx_r = ((struct __pyx_t_5spacy_6lexeme_Lexeme *)__pyx_v_lex_id)->lex; + goto __pyx_L0; + + /* "spacy/lexeme.pyx":19 + * + * + * cpdef StringHash lex_of(size_t lex_id) except 0: # <<<<<<<<<<<<<< + * '''Access the `lex' field of the Lexeme pointed to by lex_id. + * + */ + + /* function exit code */ + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* Python wrapper */ +static PyObject *__pyx_pw_5spacy_6lexeme_3lex_of(PyObject *__pyx_self, PyObject *__pyx_arg_lex_id); /*proto*/ +static char __pyx_doc_5spacy_6lexeme_2lex_of[] = "Access the `lex' field of the Lexeme pointed to by lex_id.\n\n The lex field is the hash of the string you would expect to get back from\n a standard tokenizer, i.e. the word with punctuation and other non-whitespace\n delimited tokens split off. The other fields refer to properties of the\n string that the lex field stores a hash of, except sic and tail.\n\n >>> [unhash(lex_of(lex_id) for lex_id in from_string(u'Hi! world')]\n [u'Hi', u'!', u'world']\n "; +static PyObject *__pyx_pw_5spacy_6lexeme_3lex_of(PyObject *__pyx_self, PyObject *__pyx_arg_lex_id) { + size_t __pyx_v_lex_id; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("lex_of (wrapper)", 0); + assert(__pyx_arg_lex_id); { + __pyx_v_lex_id = __Pyx_PyInt_As_size_t(__pyx_arg_lex_id); if (unlikely((__pyx_v_lex_id == (size_t)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 19; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L3_error:; + __Pyx_AddTraceback("spacy.lexeme.lex_of", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + __pyx_r = __pyx_pf_5spacy_6lexeme_2lex_of(__pyx_self, ((size_t)__pyx_v_lex_id)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5spacy_6lexeme_2lex_of(CYTHON_UNUSED PyObject *__pyx_self, size_t __pyx_v_lex_id) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + __pyx_t_5spacy_6lexeme_StringHash __pyx_t_1; + PyObject *__pyx_t_2 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("lex_of", 0); + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = __pyx_f_5spacy_6lexeme_lex_of(__pyx_v_lex_id, 0); if (unlikely(__pyx_t_1 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 19; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyInt_From_uint64_t(__pyx_t_1); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 19; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_2); + __pyx_r = __pyx_t_2; + __pyx_t_2 = 0; + goto __pyx_L0; + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_2); + __Pyx_AddTraceback("spacy.lexeme.lex_of", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "spacy/lexeme.pyx":33 + * + * + * cpdef ClusterID cluster_of(size_t lex_id): # <<<<<<<<<<<<<< + * '''Access the `cluster' field of the Lexeme pointed to by lex_id, which + * gives an integer representation of the cluster ID of the word, + */ + +static PyObject *__pyx_pw_5spacy_6lexeme_5cluster_of(PyObject *__pyx_self, PyObject *__pyx_arg_lex_id); /*proto*/ +static __pyx_t_5spacy_6lexeme_ClusterID __pyx_f_5spacy_6lexeme_cluster_of(size_t __pyx_v_lex_id, CYTHON_UNUSED int __pyx_skip_dispatch) { + __pyx_t_5spacy_6lexeme_ClusterID __pyx_r; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("cluster_of", 0); + + /* "spacy/lexeme.pyx":49 + * the same cluster ID as "pineapple", which is not what we'd like. + * ''' + * return (lex_id).cluster # <<<<<<<<<<<<<< + * + * + */ + __pyx_r = ((struct __pyx_t_5spacy_6lexeme_Lexeme *)__pyx_v_lex_id)->cluster; + goto __pyx_L0; + + /* "spacy/lexeme.pyx":33 + * + * + * cpdef ClusterID cluster_of(size_t lex_id): # <<<<<<<<<<<<<< + * '''Access the `cluster' field of the Lexeme pointed to by lex_id, which + * gives an integer representation of the cluster ID of the word, + */ + + /* function exit code */ + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* Python wrapper */ +static PyObject *__pyx_pw_5spacy_6lexeme_5cluster_of(PyObject *__pyx_self, PyObject *__pyx_arg_lex_id); /*proto*/ +static char __pyx_doc_5spacy_6lexeme_4cluster_of[] = "Access the `cluster' field of the Lexeme pointed to by lex_id, which\n gives an integer representation of the cluster ID of the word, \n which should be understood as a binary address:\n\n >>> strings = (u'pineapple', u'apple', u'dapple', u'scalable')\n >>> token_ids = [lookup(s) for s in strings]\n >>> clusters = [cluster_of(t) for t in token_ids]\n >>> print [\"{0:b\"} % cluster_of(t) for t in token_ids]\n [\"100111110110\", \"100111100100\", \"01010111011001\", \"100111110110\"]\n\n The clusterings are unideal, but often slightly useful.\n \"pineapple\" and \"apple\" share a long prefix, indicating a similar meaning,\n while \"dapple\" is totally different. On the other hand, \"scalable\" receives\n the same cluster ID as \"pineapple\", which is not what we'd like.\n "; +static PyObject *__pyx_pw_5spacy_6lexeme_5cluster_of(PyObject *__pyx_self, PyObject *__pyx_arg_lex_id) { + size_t __pyx_v_lex_id; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("cluster_of (wrapper)", 0); + assert(__pyx_arg_lex_id); { + __pyx_v_lex_id = __Pyx_PyInt_As_size_t(__pyx_arg_lex_id); if (unlikely((__pyx_v_lex_id == (size_t)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 33; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L3_error:; + __Pyx_AddTraceback("spacy.lexeme.cluster_of", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + __pyx_r = __pyx_pf_5spacy_6lexeme_4cluster_of(__pyx_self, ((size_t)__pyx_v_lex_id)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5spacy_6lexeme_4cluster_of(CYTHON_UNUSED PyObject *__pyx_self, size_t __pyx_v_lex_id) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("cluster_of", 0); + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_f_5spacy_6lexeme_cluster_of(__pyx_v_lex_id, 0)); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 33; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("spacy.lexeme.cluster_of", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "spacy/lexeme.pyx":52 + * + * + * cpdef Py_UNICODE first_of(size_t lex_id): # <<<<<<<<<<<<<< + * '''Access the `first' field of the Lexeme pointed to by lex_id, which + * stores the first character of the lex string of the word. + */ + +static PyObject *__pyx_pw_5spacy_6lexeme_7first_of(PyObject *__pyx_self, PyObject *__pyx_arg_lex_id); /*proto*/ +static Py_UNICODE __pyx_f_5spacy_6lexeme_first_of(size_t __pyx_v_lex_id, CYTHON_UNUSED int __pyx_skip_dispatch) { + Py_UNICODE __pyx_r; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("first_of", 0); + + /* "spacy/lexeme.pyx":60 + * u'H' + * ''' + * return (lex_id).first # <<<<<<<<<<<<<< + * + * + */ + __pyx_r = ((struct __pyx_t_5spacy_6lexeme_Lexeme *)__pyx_v_lex_id)->first; + goto __pyx_L0; + + /* "spacy/lexeme.pyx":52 + * + * + * cpdef Py_UNICODE first_of(size_t lex_id): # <<<<<<<<<<<<<< + * '''Access the `first' field of the Lexeme pointed to by lex_id, which + * stores the first character of the lex string of the word. + */ + + /* function exit code */ + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* Python wrapper */ +static PyObject *__pyx_pw_5spacy_6lexeme_7first_of(PyObject *__pyx_self, PyObject *__pyx_arg_lex_id); /*proto*/ +static char __pyx_doc_5spacy_6lexeme_6first_of[] = "Access the `first' field of the Lexeme pointed to by lex_id, which\n stores the first character of the lex string of the word.\n\n >>> lex_id = lookup(u'Hello')\n >>> unhash(first_of(lex_id))\n u'H'\n "; +static PyObject *__pyx_pw_5spacy_6lexeme_7first_of(PyObject *__pyx_self, PyObject *__pyx_arg_lex_id) { + size_t __pyx_v_lex_id; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("first_of (wrapper)", 0); + assert(__pyx_arg_lex_id); { + __pyx_v_lex_id = __Pyx_PyInt_As_size_t(__pyx_arg_lex_id); if (unlikely((__pyx_v_lex_id == (size_t)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 52; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L3_error:; + __Pyx_AddTraceback("spacy.lexeme.first_of", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + __pyx_r = __pyx_pf_5spacy_6lexeme_6first_of(__pyx_self, ((size_t)__pyx_v_lex_id)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5spacy_6lexeme_6first_of(CYTHON_UNUSED PyObject *__pyx_self, size_t __pyx_v_lex_id) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("first_of", 0); + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = PyUnicode_FromOrdinal(__pyx_f_5spacy_6lexeme_first_of(__pyx_v_lex_id, 0)); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 52; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("spacy.lexeme.first_of", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "spacy/lexeme.pyx":63 + * + * + * cpdef double prob_of(size_t lex_id): # <<<<<<<<<<<<<< + * '''Access the `prob' field of the Lexeme pointed to by lex_id, which stores + * the smoothed unigram log probability of the word, as estimated from a large + */ + +static PyObject *__pyx_pw_5spacy_6lexeme_9prob_of(PyObject *__pyx_self, PyObject *__pyx_arg_lex_id); /*proto*/ +static double __pyx_f_5spacy_6lexeme_prob_of(CYTHON_UNUSED size_t __pyx_v_lex_id, CYTHON_UNUSED int __pyx_skip_dispatch) { + double __pyx_r; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("prob_of", 0); + + /* function exit code */ + __pyx_r = 0; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* Python wrapper */ +static PyObject *__pyx_pw_5spacy_6lexeme_9prob_of(PyObject *__pyx_self, PyObject *__pyx_arg_lex_id); /*proto*/ +static char __pyx_doc_5spacy_6lexeme_8prob_of[] = "Access the `prob' field of the Lexeme pointed to by lex_id, which stores\n the smoothed unigram log probability of the word, as estimated from a large\n text corpus. By default, probabilities are based on counts from Gigaword,\n smoothed using Knesser-Ney; but any probabilities file can be supplied to\n load_probs.\n \n >>> prob_of(lookup(u'world'))\n -20.10340371976182\n "; +static PyObject *__pyx_pw_5spacy_6lexeme_9prob_of(PyObject *__pyx_self, PyObject *__pyx_arg_lex_id) { + size_t __pyx_v_lex_id; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("prob_of (wrapper)", 0); + assert(__pyx_arg_lex_id); { + __pyx_v_lex_id = __Pyx_PyInt_As_size_t(__pyx_arg_lex_id); if (unlikely((__pyx_v_lex_id == (size_t)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 63; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L3_error:; + __Pyx_AddTraceback("spacy.lexeme.prob_of", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + __pyx_r = __pyx_pf_5spacy_6lexeme_8prob_of(__pyx_self, ((size_t)__pyx_v_lex_id)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5spacy_6lexeme_8prob_of(CYTHON_UNUSED PyObject *__pyx_self, size_t __pyx_v_lex_id) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("prob_of", 0); + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = PyFloat_FromDouble(__pyx_f_5spacy_6lexeme_prob_of(__pyx_v_lex_id, 0)); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 63; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("spacy.lexeme.prob_of", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "spacy/lexeme.pyx":76 + * + * + * cpdef StringHash last3_of(size_t lex_id): # <<<<<<<<<<<<<< + * '''Access the `last3' field of the Lexeme pointed to by lex_id, which stores + * the hash of the last three characters of the word: + */ + +static PyObject *__pyx_pw_5spacy_6lexeme_11last3_of(PyObject *__pyx_self, PyObject *__pyx_arg_lex_id); /*proto*/ +static __pyx_t_5spacy_6lexeme_StringHash __pyx_f_5spacy_6lexeme_last3_of(size_t __pyx_v_lex_id, CYTHON_UNUSED int __pyx_skip_dispatch) { + __pyx_t_5spacy_6lexeme_StringHash __pyx_r; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("last3_of", 0); + + /* "spacy/lexeme.pyx":84 + * [u'llo', u'!'] + * ''' + * return (lex_id).last3 # <<<<<<<<<<<<<< + * + * + */ + __pyx_r = ((struct __pyx_t_5spacy_6lexeme_Lexeme *)__pyx_v_lex_id)->last3; + goto __pyx_L0; + + /* "spacy/lexeme.pyx":76 + * + * + * cpdef StringHash last3_of(size_t lex_id): # <<<<<<<<<<<<<< + * '''Access the `last3' field of the Lexeme pointed to by lex_id, which stores + * the hash of the last three characters of the word: + */ + + /* function exit code */ + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* Python wrapper */ +static PyObject *__pyx_pw_5spacy_6lexeme_11last3_of(PyObject *__pyx_self, PyObject *__pyx_arg_lex_id); /*proto*/ +static char __pyx_doc_5spacy_6lexeme_10last3_of[] = "Access the `last3' field of the Lexeme pointed to by lex_id, which stores\n the hash of the last three characters of the word:\n\n >>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]\n >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]\n [u'llo', u'!']\n "; +static PyObject *__pyx_pw_5spacy_6lexeme_11last3_of(PyObject *__pyx_self, PyObject *__pyx_arg_lex_id) { + size_t __pyx_v_lex_id; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("last3_of (wrapper)", 0); + assert(__pyx_arg_lex_id); { + __pyx_v_lex_id = __Pyx_PyInt_As_size_t(__pyx_arg_lex_id); if (unlikely((__pyx_v_lex_id == (size_t)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 76; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L3_error:; + __Pyx_AddTraceback("spacy.lexeme.last3_of", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + __pyx_r = __pyx_pf_5spacy_6lexeme_10last3_of(__pyx_self, ((size_t)__pyx_v_lex_id)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5spacy_6lexeme_10last3_of(CYTHON_UNUSED PyObject *__pyx_self, size_t __pyx_v_lex_id) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("last3_of", 0); + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = __Pyx_PyInt_From_uint64_t(__pyx_f_5spacy_6lexeme_last3_of(__pyx_v_lex_id, 0)); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 76; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("spacy.lexeme.last3_of", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "spacy/lexeme.pyx":87 + * + * + * cpdef bint is_oft_upper(size_t lex_id): # <<<<<<<<<<<<<< + * '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which + * stores whether the lowered version of the string hashed by `lex' is found + */ + +static PyObject *__pyx_pw_5spacy_6lexeme_13is_oft_upper(PyObject *__pyx_self, PyObject *__pyx_arg_lex_id); /*proto*/ +static int __pyx_f_5spacy_6lexeme_is_oft_upper(size_t __pyx_v_lex_id, CYTHON_UNUSED int __pyx_skip_dispatch) { + int __pyx_r; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("is_oft_upper", 0); + + /* "spacy/lexeme.pyx":99 + * True + * ''' + * return (lex_id).oft_upper # <<<<<<<<<<<<<< + * + * + */ + __pyx_r = ((struct __pyx_t_5spacy_6lexeme_Lexeme *)__pyx_v_lex_id)->oft_upper; + goto __pyx_L0; + + /* "spacy/lexeme.pyx":87 + * + * + * cpdef bint is_oft_upper(size_t lex_id): # <<<<<<<<<<<<<< + * '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which + * stores whether the lowered version of the string hashed by `lex' is found + */ + + /* function exit code */ + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* Python wrapper */ +static PyObject *__pyx_pw_5spacy_6lexeme_13is_oft_upper(PyObject *__pyx_self, PyObject *__pyx_arg_lex_id); /*proto*/ +static char __pyx_doc_5spacy_6lexeme_12is_oft_upper[] = "Access the `oft_upper' field of the Lexeme pointed to by lex_id, which\n stores whether the lowered version of the string hashed by `lex' is found\n in all-upper case frequently in a large sample of text. Users are free\n to load different data, by default we use a sample from Wikipedia, with\n a threshold of 0.95, picked to maximize mutual information for POS tagging.\n\n >>> is_oft_upper(lookup(u'abc'))\n True\n >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer\n True\n "; +static PyObject *__pyx_pw_5spacy_6lexeme_13is_oft_upper(PyObject *__pyx_self, PyObject *__pyx_arg_lex_id) { + size_t __pyx_v_lex_id; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("is_oft_upper (wrapper)", 0); + assert(__pyx_arg_lex_id); { + __pyx_v_lex_id = __Pyx_PyInt_As_size_t(__pyx_arg_lex_id); if (unlikely((__pyx_v_lex_id == (size_t)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 87; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L3_error:; + __Pyx_AddTraceback("spacy.lexeme.is_oft_upper", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + __pyx_r = __pyx_pf_5spacy_6lexeme_12is_oft_upper(__pyx_self, ((size_t)__pyx_v_lex_id)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5spacy_6lexeme_12is_oft_upper(CYTHON_UNUSED PyObject *__pyx_self, size_t __pyx_v_lex_id) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("is_oft_upper", 0); + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = __Pyx_PyBool_FromLong(__pyx_f_5spacy_6lexeme_is_oft_upper(__pyx_v_lex_id, 0)); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 87; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("spacy.lexeme.is_oft_upper", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "spacy/lexeme.pyx":102 + * + * + * cpdef bint is_oft_title(size_t lex_id): # <<<<<<<<<<<<<< + * '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which + * stores whether the lowered version of the string hashed by `lex' is found + */ + +static PyObject *__pyx_pw_5spacy_6lexeme_15is_oft_title(PyObject *__pyx_self, PyObject *__pyx_arg_lex_id); /*proto*/ +static int __pyx_f_5spacy_6lexeme_is_oft_title(size_t __pyx_v_lex_id, CYTHON_UNUSED int __pyx_skip_dispatch) { + int __pyx_r; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("is_oft_title", 0); + + /* "spacy/lexeme.pyx":114 + * True + * ''' + * return (lex_id).oft_title # <<<<<<<<<<<<<< + */ + __pyx_r = ((struct __pyx_t_5spacy_6lexeme_Lexeme *)__pyx_v_lex_id)->oft_title; + goto __pyx_L0; + + /* "spacy/lexeme.pyx":102 + * + * + * cpdef bint is_oft_title(size_t lex_id): # <<<<<<<<<<<<<< + * '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which + * stores whether the lowered version of the string hashed by `lex' is found + */ + + /* function exit code */ + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* Python wrapper */ +static PyObject *__pyx_pw_5spacy_6lexeme_15is_oft_title(PyObject *__pyx_self, PyObject *__pyx_arg_lex_id); /*proto*/ +static char __pyx_doc_5spacy_6lexeme_14is_oft_title[] = "Access the `oft_upper' field of the Lexeme pointed to by lex_id, which\n stores whether the lowered version of the string hashed by `lex' is found\n title-cased frequently in a large sample of text. Users are free\n to load different data, by default we use a sample from Wikipedia, with\n a threshold of 0.3, picked to maximize mutual information for POS tagging.\n\n >>> is_oft_title(lookup(u'marcus'))\n True\n >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value\n True\n "; +static PyObject *__pyx_pw_5spacy_6lexeme_15is_oft_title(PyObject *__pyx_self, PyObject *__pyx_arg_lex_id) { + size_t __pyx_v_lex_id; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("is_oft_title (wrapper)", 0); + assert(__pyx_arg_lex_id); { + __pyx_v_lex_id = __Pyx_PyInt_As_size_t(__pyx_arg_lex_id); if (unlikely((__pyx_v_lex_id == (size_t)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 102; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L3_error:; + __Pyx_AddTraceback("spacy.lexeme.is_oft_title", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + __pyx_r = __pyx_pf_5spacy_6lexeme_14is_oft_title(__pyx_self, ((size_t)__pyx_v_lex_id)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5spacy_6lexeme_14is_oft_title(CYTHON_UNUSED PyObject *__pyx_self, size_t __pyx_v_lex_id) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("is_oft_title", 0); + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = __Pyx_PyBool_FromLong(__pyx_f_5spacy_6lexeme_is_oft_title(__pyx_v_lex_id, 0)); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 102; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("spacy.lexeme.is_oft_title", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyMethodDef __pyx_methods[] = { + {__Pyx_NAMESTR("sic_of"), (PyCFunction)__pyx_pw_5spacy_6lexeme_1sic_of, METH_O, __Pyx_DOCSTR(__pyx_doc_5spacy_6lexeme_sic_of)}, + {__Pyx_NAMESTR("lex_of"), (PyCFunction)__pyx_pw_5spacy_6lexeme_3lex_of, METH_O, __Pyx_DOCSTR(__pyx_doc_5spacy_6lexeme_2lex_of)}, + {__Pyx_NAMESTR("cluster_of"), (PyCFunction)__pyx_pw_5spacy_6lexeme_5cluster_of, METH_O, __Pyx_DOCSTR(__pyx_doc_5spacy_6lexeme_4cluster_of)}, + {__Pyx_NAMESTR("first_of"), (PyCFunction)__pyx_pw_5spacy_6lexeme_7first_of, METH_O, __Pyx_DOCSTR(__pyx_doc_5spacy_6lexeme_6first_of)}, + {__Pyx_NAMESTR("prob_of"), (PyCFunction)__pyx_pw_5spacy_6lexeme_9prob_of, METH_O, __Pyx_DOCSTR(__pyx_doc_5spacy_6lexeme_8prob_of)}, + {__Pyx_NAMESTR("last3_of"), (PyCFunction)__pyx_pw_5spacy_6lexeme_11last3_of, METH_O, __Pyx_DOCSTR(__pyx_doc_5spacy_6lexeme_10last3_of)}, + {__Pyx_NAMESTR("is_oft_upper"), (PyCFunction)__pyx_pw_5spacy_6lexeme_13is_oft_upper, METH_O, __Pyx_DOCSTR(__pyx_doc_5spacy_6lexeme_12is_oft_upper)}, + {__Pyx_NAMESTR("is_oft_title"), (PyCFunction)__pyx_pw_5spacy_6lexeme_15is_oft_title, METH_O, __Pyx_DOCSTR(__pyx_doc_5spacy_6lexeme_14is_oft_title)}, + {0, 0, 0, 0} +}; + +#if PY_MAJOR_VERSION >= 3 +static struct PyModuleDef __pyx_moduledef = { + #if PY_VERSION_HEX < 0x03020000 + { PyObject_HEAD_INIT(NULL) NULL, 0, NULL }, + #else + PyModuleDef_HEAD_INIT, + #endif + __Pyx_NAMESTR("lexeme"), + __Pyx_DOCSTR(__pyx_k_Accessors_for_Lexeme_properties), /* m_doc */ + -1, /* m_size */ + __pyx_methods /* m_methods */, + NULL, /* m_reload */ + NULL, /* m_traverse */ + NULL, /* m_clear */ + NULL /* m_free */ +}; +#endif + +static __Pyx_StringTabEntry __pyx_string_tab[] = { + {&__pyx_kp_u_Access_the_cluster_field_of_the, __pyx_k_Access_the_cluster_field_of_the, sizeof(__pyx_k_Access_the_cluster_field_of_the), 0, 1, 0, 0}, + {&__pyx_kp_u_Access_the_first_field_of_the_Le, __pyx_k_Access_the_first_field_of_the_Le, sizeof(__pyx_k_Access_the_first_field_of_the_Le), 0, 1, 0, 0}, + {&__pyx_kp_u_Access_the_last3_field_of_the_Le, __pyx_k_Access_the_last3_field_of_the_Le, sizeof(__pyx_k_Access_the_last3_field_of_the_Le), 0, 1, 0, 0}, + {&__pyx_kp_u_Access_the_lex_field_of_the_Lexe, __pyx_k_Access_the_lex_field_of_the_Lexe, sizeof(__pyx_k_Access_the_lex_field_of_the_Lexe), 0, 1, 0, 0}, + {&__pyx_kp_u_Access_the_oft_upper_field_of_th, __pyx_k_Access_the_oft_upper_field_of_th, sizeof(__pyx_k_Access_the_oft_upper_field_of_th), 0, 1, 0, 0}, + {&__pyx_kp_u_Access_the_oft_upper_field_of_th_2, __pyx_k_Access_the_oft_upper_field_of_th_2, sizeof(__pyx_k_Access_the_oft_upper_field_of_th_2), 0, 1, 0, 0}, + {&__pyx_kp_u_Access_the_prob_field_of_the_Lex, __pyx_k_Access_the_prob_field_of_the_Lex, sizeof(__pyx_k_Access_the_prob_field_of_the_Lex), 0, 1, 0, 0}, + {&__pyx_kp_u_Access_the_sic_field_of_the_Lexe, __pyx_k_Access_the_sic_field_of_the_Lexe, sizeof(__pyx_k_Access_the_sic_field_of_the_Lexe), 0, 1, 0, 0}, + {&__pyx_kp_u_cluster_of_line_33, __pyx_k_cluster_of_line_33, sizeof(__pyx_k_cluster_of_line_33), 0, 1, 0, 0}, + {&__pyx_kp_u_first_of_line_52, __pyx_k_first_of_line_52, sizeof(__pyx_k_first_of_line_52), 0, 1, 0, 0}, + {&__pyx_kp_u_is_oft_title_line_102, __pyx_k_is_oft_title_line_102, sizeof(__pyx_k_is_oft_title_line_102), 0, 1, 0, 0}, + {&__pyx_kp_u_is_oft_upper_line_87, __pyx_k_is_oft_upper_line_87, sizeof(__pyx_k_is_oft_upper_line_87), 0, 1, 0, 0}, + {&__pyx_kp_u_last3_of_line_76, __pyx_k_last3_of_line_76, sizeof(__pyx_k_last3_of_line_76), 0, 1, 0, 0}, + {&__pyx_kp_u_lex_of_line_19, __pyx_k_lex_of_line_19, sizeof(__pyx_k_lex_of_line_19), 0, 1, 0, 0}, + {&__pyx_n_s_main, __pyx_k_main, sizeof(__pyx_k_main), 0, 0, 1, 1}, + {&__pyx_kp_u_prob_of_line_63, __pyx_k_prob_of_line_63, sizeof(__pyx_k_prob_of_line_63), 0, 1, 0, 0}, + {&__pyx_kp_u_sic_of_line_7, __pyx_k_sic_of_line_7, sizeof(__pyx_k_sic_of_line_7), 0, 1, 0, 0}, + {&__pyx_n_s_test, __pyx_k_test, sizeof(__pyx_k_test), 0, 0, 1, 1}, + {0, 0, 0, 0, 0, 0, 0} +}; +static int __Pyx_InitCachedBuiltins(void) { + return 0; +} + +static int __Pyx_InitCachedConstants(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_InitCachedConstants", 0); + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_InitGlobals(void) { + if (__Pyx_InitStrings(__pyx_string_tab) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + return 0; + __pyx_L1_error:; + return -1; +} + +#if PY_MAJOR_VERSION < 3 +PyMODINIT_FUNC initlexeme(void); /*proto*/ +PyMODINIT_FUNC initlexeme(void) +#else +PyMODINIT_FUNC PyInit_lexeme(void); /*proto*/ +PyMODINIT_FUNC PyInit_lexeme(void) +#endif +{ + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannyDeclarations + #if CYTHON_REFNANNY + __Pyx_RefNanny = __Pyx_RefNannyImportAPI("refnanny"); + if (!__Pyx_RefNanny) { + PyErr_Clear(); + __Pyx_RefNanny = __Pyx_RefNannyImportAPI("Cython.Runtime.refnanny"); + if (!__Pyx_RefNanny) + Py_FatalError("failed to import 'refnanny' module"); + } + #endif + __Pyx_RefNannySetupContext("PyMODINIT_FUNC PyInit_lexeme(void)", 0); + if ( __Pyx_check_binary_version() < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_empty_tuple = PyTuple_New(0); if (unlikely(!__pyx_empty_tuple)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_empty_bytes = PyBytes_FromStringAndSize("", 0); if (unlikely(!__pyx_empty_bytes)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + #ifdef __Pyx_CyFunction_USED + if (__Pyx_CyFunction_init() < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + #endif + #ifdef __Pyx_FusedFunction_USED + if (__pyx_FusedFunction_init() < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + #endif + #ifdef __Pyx_Generator_USED + if (__pyx_Generator_init() < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + #endif + /*--- Library function declarations ---*/ + /*--- Threads initialization code ---*/ + #if defined(__PYX_FORCE_INIT_THREADS) && __PYX_FORCE_INIT_THREADS + #ifdef WITH_THREAD /* Python build with threading support? */ + PyEval_InitThreads(); + #endif + #endif + /*--- Module creation code ---*/ + #if PY_MAJOR_VERSION < 3 + __pyx_m = Py_InitModule4(__Pyx_NAMESTR("lexeme"), __pyx_methods, __Pyx_DOCSTR(__pyx_k_Accessors_for_Lexeme_properties), 0, PYTHON_API_VERSION); Py_XINCREF(__pyx_m); + #else + __pyx_m = PyModule_Create(&__pyx_moduledef); + #endif + if (unlikely(!__pyx_m)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_d = PyModule_GetDict(__pyx_m); if (unlikely(!__pyx_d)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + Py_INCREF(__pyx_d); + __pyx_b = PyImport_AddModule(__Pyx_NAMESTR(__Pyx_BUILTIN_MODULE_NAME)); if (unlikely(!__pyx_b)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + #if CYTHON_COMPILING_IN_PYPY + Py_INCREF(__pyx_b); + #endif + if (__Pyx_SetAttrString(__pyx_m, "__builtins__", __pyx_b) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + /*--- Initialize various global constants etc. ---*/ + if (unlikely(__Pyx_InitGlobals() < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + #if PY_MAJOR_VERSION < 3 && (__PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT) + if (__Pyx_init_sys_getdefaultencoding_params() < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + #endif + if (__pyx_module_is_main_spacy__lexeme) { + if (__Pyx_SetAttrString(__pyx_m, "__name__", __pyx_n_s_main) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + } + #if PY_MAJOR_VERSION >= 3 + { + PyObject *modules = PyImport_GetModuleDict(); if (unlikely(!modules)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (!PyDict_GetItemString(modules, "spacy.lexeme")) { + if (unlikely(PyDict_SetItemString(modules, "spacy.lexeme", __pyx_m) < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + } + } + #endif + /*--- Builtin init code ---*/ + if (unlikely(__Pyx_InitCachedBuiltins() < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + /*--- Constants init code ---*/ + if (unlikely(__Pyx_InitCachedConstants() < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + /*--- Global init code ---*/ + /*--- Variable export code ---*/ + /*--- Function export code ---*/ + /*--- Type init code ---*/ + /*--- Type import code ---*/ + /*--- Variable import code ---*/ + /*--- Function import code ---*/ + /*--- Execution code ---*/ + + /* "spacy/lexeme.pyx":1 + * '''Accessors for Lexeme properties, given a lex_id, which is cast to a Lexeme*. # <<<<<<<<<<<<<< + * Mostly useful from Python-space. From Cython-space, you can just cast to + * Lexeme* yourself. + */ + __pyx_t_1 = PyDict_New(); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_t_1, __pyx_kp_u_sic_of_line_7, __pyx_kp_u_Access_the_sic_field_of_the_Lexe) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyDict_SetItem(__pyx_t_1, __pyx_kp_u_lex_of_line_19, __pyx_kp_u_Access_the_lex_field_of_the_Lexe) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyDict_SetItem(__pyx_t_1, __pyx_kp_u_cluster_of_line_33, __pyx_kp_u_Access_the_cluster_field_of_the) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyDict_SetItem(__pyx_t_1, __pyx_kp_u_first_of_line_52, __pyx_kp_u_Access_the_first_field_of_the_Le) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyDict_SetItem(__pyx_t_1, __pyx_kp_u_prob_of_line_63, __pyx_kp_u_Access_the_prob_field_of_the_Lex) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyDict_SetItem(__pyx_t_1, __pyx_kp_u_last3_of_line_76, __pyx_kp_u_Access_the_last3_field_of_the_Le) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyDict_SetItem(__pyx_t_1, __pyx_kp_u_is_oft_upper_line_87, __pyx_kp_u_Access_the_oft_upper_field_of_th) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyDict_SetItem(__pyx_t_1, __pyx_kp_u_is_oft_title_line_102, __pyx_kp_u_Access_the_oft_upper_field_of_th_2) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_1) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + if (__pyx_m) { + __Pyx_AddTraceback("init spacy.lexeme", __pyx_clineno, __pyx_lineno, __pyx_filename); + Py_DECREF(__pyx_m); __pyx_m = 0; + } else if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_ImportError, "init spacy.lexeme"); + } + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + #if PY_MAJOR_VERSION < 3 + return; + #else + return __pyx_m; + #endif +} + +/* Runtime support code */ +#if CYTHON_REFNANNY +static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname) { + PyObject *m = NULL, *p = NULL; + void *r = NULL; + m = PyImport_ImportModule((char *)modname); + if (!m) goto end; + p = PyObject_GetAttrString(m, (char *)"RefNannyAPI"); + if (!p) goto end; + r = PyLong_AsVoidPtr(p); +end: + Py_XDECREF(p); + Py_XDECREF(m); + return (__Pyx_RefNannyAPIStruct *)r; +} +#endif /* CYTHON_REFNANNY */ + +#define __PYX_VERIFY_RETURN_INT(target_type, func_type, func) \ + { \ + func_type value = func(x); \ + if (sizeof(target_type) < sizeof(func_type)) { \ + if (unlikely(value != (func_type) (target_type) value)) { \ + func_type zero = 0; \ + PyErr_SetString(PyExc_OverflowError, \ + (is_unsigned && unlikely(value < zero)) ? \ + "can't convert negative value to " #target_type : \ + "value too large to convert to " #target_type); \ + return (target_type) -1; \ + } \ + } \ + return (target_type) value; \ + } + +#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + #include "longintrepr.h" + #endif +#endif +static CYTHON_INLINE size_t __Pyx_PyInt_As_size_t(PyObject *x) { + const size_t neg_one = (size_t) -1, const_zero = 0; + const int is_unsigned = neg_one > const_zero; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_Check(x))) { + if (sizeof(size_t) < sizeof(long)) { + __PYX_VERIFY_RETURN_INT(size_t, long, PyInt_AS_LONG) + } else { + long val = PyInt_AS_LONG(x); + if (is_unsigned && unlikely(val < 0)) { + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to size_t"); + return (size_t) -1; + } + return (size_t) val; + } + } else +#endif + if (likely(PyLong_Check(x))) { + if (is_unsigned) { +#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + if (sizeof(digit) <= sizeof(size_t)) { + switch (Py_SIZE(x)) { + case 0: return 0; + case 1: return (size_t) ((PyLongObject*)x)->ob_digit[0]; + } + } + #endif +#endif + if (unlikely(Py_SIZE(x) < 0)) { + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to size_t"); + return (size_t) -1; + } + if (sizeof(size_t) <= sizeof(unsigned long)) { + __PYX_VERIFY_RETURN_INT(size_t, unsigned long, PyLong_AsUnsignedLong) + } else if (sizeof(size_t) <= sizeof(unsigned long long)) { + __PYX_VERIFY_RETURN_INT(size_t, unsigned long long, PyLong_AsUnsignedLongLong) + } + } else { +#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + if (sizeof(digit) <= sizeof(size_t)) { + switch (Py_SIZE(x)) { + case 0: return 0; + case 1: return +(size_t) ((PyLongObject*)x)->ob_digit[0]; + case -1: return -(size_t) ((PyLongObject*)x)->ob_digit[0]; + } + } + #endif +#endif + if (sizeof(size_t) <= sizeof(long)) { + __PYX_VERIFY_RETURN_INT(size_t, long, PyLong_AsLong) + } else if (sizeof(size_t) <= sizeof(long long)) { + __PYX_VERIFY_RETURN_INT(size_t, long long, PyLong_AsLongLong) + } + } + { +#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray) + PyErr_SetString(PyExc_RuntimeError, + "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers"); +#else + size_t val; + PyObject *v = __Pyx_PyNumber_Int(x); + #if PY_MAJOR_VERSION < 3 + if (likely(v) && !PyLong_Check(v)) { + PyObject *tmp = v; + v = PyNumber_Long(tmp); + Py_DECREF(tmp); + } + #endif + if (likely(v)) { + int one = 1; int is_little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&val; + int ret = _PyLong_AsByteArray((PyLongObject *)v, + bytes, sizeof(val), + is_little, !is_unsigned); + Py_DECREF(v); + if (likely(!ret)) + return val; + } +#endif + return (size_t) -1; + } + } else { + size_t val; + PyObject *tmp = __Pyx_PyNumber_Int(x); + if (!tmp) return (size_t) -1; + val = __Pyx_PyInt_As_size_t(tmp); + Py_DECREF(tmp); + return val; + } +} + +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_uint64_t(uint64_t value) { + const uint64_t neg_one = (uint64_t) -1, const_zero = 0; + const int is_unsigned = neg_one > const_zero; + if (is_unsigned) { + if (sizeof(uint64_t) < sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(uint64_t) <= sizeof(unsigned long)) { + return PyLong_FromUnsignedLong((unsigned long) value); + } else if (sizeof(uint64_t) <= sizeof(unsigned long long)) { + return PyLong_FromUnsignedLongLong((unsigned long long) value); + } + } else { + if (sizeof(uint64_t) <= sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(uint64_t) <= sizeof(long long)) { + return PyLong_FromLongLong((long long) value); + } + } + { + int one = 1; int little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&value; + return _PyLong_FromByteArray(bytes, sizeof(uint64_t), + little, !is_unsigned); + } +} + +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_int(int value) { + const int neg_one = (int) -1, const_zero = 0; + const int is_unsigned = neg_one > const_zero; + if (is_unsigned) { + if (sizeof(int) < sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(int) <= sizeof(unsigned long)) { + return PyLong_FromUnsignedLong((unsigned long) value); + } else if (sizeof(int) <= sizeof(unsigned long long)) { + return PyLong_FromUnsignedLongLong((unsigned long long) value); + } + } else { + if (sizeof(int) <= sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(int) <= sizeof(long long)) { + return PyLong_FromLongLong((long long) value); + } + } + { + int one = 1; int little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&value; + return _PyLong_FromByteArray(bytes, sizeof(int), + little, !is_unsigned); + } +} + +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value) { + const long neg_one = (long) -1, const_zero = 0; + const int is_unsigned = neg_one > const_zero; + if (is_unsigned) { + if (sizeof(long) < sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(long) <= sizeof(unsigned long)) { + return PyLong_FromUnsignedLong((unsigned long) value); + } else if (sizeof(long) <= sizeof(unsigned long long)) { + return PyLong_FromUnsignedLongLong((unsigned long long) value); + } + } else { + if (sizeof(long) <= sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(long) <= sizeof(long long)) { + return PyLong_FromLongLong((long long) value); + } + } + { + int one = 1; int little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&value; + return _PyLong_FromByteArray(bytes, sizeof(long), + little, !is_unsigned); + } +} + +#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + #include "longintrepr.h" + #endif +#endif +static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *x) { + const long neg_one = (long) -1, const_zero = 0; + const int is_unsigned = neg_one > const_zero; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_Check(x))) { + if (sizeof(long) < sizeof(long)) { + __PYX_VERIFY_RETURN_INT(long, long, PyInt_AS_LONG) + } else { + long val = PyInt_AS_LONG(x); + if (is_unsigned && unlikely(val < 0)) { + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to long"); + return (long) -1; + } + return (long) val; + } + } else +#endif + if (likely(PyLong_Check(x))) { + if (is_unsigned) { +#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + if (sizeof(digit) <= sizeof(long)) { + switch (Py_SIZE(x)) { + case 0: return 0; + case 1: return (long) ((PyLongObject*)x)->ob_digit[0]; + } + } + #endif +#endif + if (unlikely(Py_SIZE(x) < 0)) { + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to long"); + return (long) -1; + } + if (sizeof(long) <= sizeof(unsigned long)) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, PyLong_AsUnsignedLong) + } else if (sizeof(long) <= sizeof(unsigned long long)) { + __PYX_VERIFY_RETURN_INT(long, unsigned long long, PyLong_AsUnsignedLongLong) + } + } else { +#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + if (sizeof(digit) <= sizeof(long)) { + switch (Py_SIZE(x)) { + case 0: return 0; + case 1: return +(long) ((PyLongObject*)x)->ob_digit[0]; + case -1: return -(long) ((PyLongObject*)x)->ob_digit[0]; + } + } + #endif +#endif + if (sizeof(long) <= sizeof(long)) { + __PYX_VERIFY_RETURN_INT(long, long, PyLong_AsLong) + } else if (sizeof(long) <= sizeof(long long)) { + __PYX_VERIFY_RETURN_INT(long, long long, PyLong_AsLongLong) + } + } + { +#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray) + PyErr_SetString(PyExc_RuntimeError, + "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers"); +#else + long val; + PyObject *v = __Pyx_PyNumber_Int(x); + #if PY_MAJOR_VERSION < 3 + if (likely(v) && !PyLong_Check(v)) { + PyObject *tmp = v; + v = PyNumber_Long(tmp); + Py_DECREF(tmp); + } + #endif + if (likely(v)) { + int one = 1; int is_little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&val; + int ret = _PyLong_AsByteArray((PyLongObject *)v, + bytes, sizeof(val), + is_little, !is_unsigned); + Py_DECREF(v); + if (likely(!ret)) + return val; + } +#endif + return (long) -1; + } + } else { + long val; + PyObject *tmp = __Pyx_PyNumber_Int(x); + if (!tmp) return (long) -1; + val = __Pyx_PyInt_As_long(tmp); + Py_DECREF(tmp); + return val; + } +} + +#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + #include "longintrepr.h" + #endif +#endif +static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *x) { + const int neg_one = (int) -1, const_zero = 0; + const int is_unsigned = neg_one > const_zero; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_Check(x))) { + if (sizeof(int) < sizeof(long)) { + __PYX_VERIFY_RETURN_INT(int, long, PyInt_AS_LONG) + } else { + long val = PyInt_AS_LONG(x); + if (is_unsigned && unlikely(val < 0)) { + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to int"); + return (int) -1; + } + return (int) val; + } + } else +#endif + if (likely(PyLong_Check(x))) { + if (is_unsigned) { +#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + if (sizeof(digit) <= sizeof(int)) { + switch (Py_SIZE(x)) { + case 0: return 0; + case 1: return (int) ((PyLongObject*)x)->ob_digit[0]; + } + } + #endif +#endif + if (unlikely(Py_SIZE(x) < 0)) { + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to int"); + return (int) -1; + } + if (sizeof(int) <= sizeof(unsigned long)) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, PyLong_AsUnsignedLong) + } else if (sizeof(int) <= sizeof(unsigned long long)) { + __PYX_VERIFY_RETURN_INT(int, unsigned long long, PyLong_AsUnsignedLongLong) + } + } else { +#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + if (sizeof(digit) <= sizeof(int)) { + switch (Py_SIZE(x)) { + case 0: return 0; + case 1: return +(int) ((PyLongObject*)x)->ob_digit[0]; + case -1: return -(int) ((PyLongObject*)x)->ob_digit[0]; + } + } + #endif +#endif + if (sizeof(int) <= sizeof(long)) { + __PYX_VERIFY_RETURN_INT(int, long, PyLong_AsLong) + } else if (sizeof(int) <= sizeof(long long)) { + __PYX_VERIFY_RETURN_INT(int, long long, PyLong_AsLongLong) + } + } + { +#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray) + PyErr_SetString(PyExc_RuntimeError, + "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers"); +#else + int val; + PyObject *v = __Pyx_PyNumber_Int(x); + #if PY_MAJOR_VERSION < 3 + if (likely(v) && !PyLong_Check(v)) { + PyObject *tmp = v; + v = PyNumber_Long(tmp); + Py_DECREF(tmp); + } + #endif + if (likely(v)) { + int one = 1; int is_little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&val; + int ret = _PyLong_AsByteArray((PyLongObject *)v, + bytes, sizeof(val), + is_little, !is_unsigned); + Py_DECREF(v); + if (likely(!ret)) + return val; + } +#endif + return (int) -1; + } + } else { + int val; + PyObject *tmp = __Pyx_PyNumber_Int(x); + if (!tmp) return (int) -1; + val = __Pyx_PyInt_As_int(tmp); + Py_DECREF(tmp); + return val; + } +} + +static int __Pyx_check_binary_version(void) { + char ctversion[4], rtversion[4]; + PyOS_snprintf(ctversion, 4, "%d.%d", PY_MAJOR_VERSION, PY_MINOR_VERSION); + PyOS_snprintf(rtversion, 4, "%s", Py_GetVersion()); + if (ctversion[0] != rtversion[0] || ctversion[2] != rtversion[2]) { + char message[200]; + PyOS_snprintf(message, sizeof(message), + "compiletime version %s of module '%.100s' " + "does not match runtime version %s", + ctversion, __Pyx_MODULE_NAME, rtversion); + #if PY_VERSION_HEX < 0x02050000 + return PyErr_Warn(NULL, message); + #else + return PyErr_WarnEx(NULL, message, 1); + #endif + } + return 0; +} + +static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line) { + int start = 0, mid = 0, end = count - 1; + if (end >= 0 && code_line > entries[end].code_line) { + return count; + } + while (start < end) { + mid = (start + end) / 2; + if (code_line < entries[mid].code_line) { + end = mid; + } else if (code_line > entries[mid].code_line) { + start = mid + 1; + } else { + return mid; + } + } + if (code_line <= entries[mid].code_line) { + return mid; + } else { + return mid + 1; + } +} +static PyCodeObject *__pyx_find_code_object(int code_line) { + PyCodeObject* code_object; + int pos; + if (unlikely(!code_line) || unlikely(!__pyx_code_cache.entries)) { + return NULL; + } + pos = __pyx_bisect_code_objects(__pyx_code_cache.entries, __pyx_code_cache.count, code_line); + if (unlikely(pos >= __pyx_code_cache.count) || unlikely(__pyx_code_cache.entries[pos].code_line != code_line)) { + return NULL; + } + code_object = __pyx_code_cache.entries[pos].code_object; + Py_INCREF(code_object); + return code_object; +} +static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object) { + int pos, i; + __Pyx_CodeObjectCacheEntry* entries = __pyx_code_cache.entries; + if (unlikely(!code_line)) { + return; + } + if (unlikely(!entries)) { + entries = (__Pyx_CodeObjectCacheEntry*)PyMem_Malloc(64*sizeof(__Pyx_CodeObjectCacheEntry)); + if (likely(entries)) { + __pyx_code_cache.entries = entries; + __pyx_code_cache.max_count = 64; + __pyx_code_cache.count = 1; + entries[0].code_line = code_line; + entries[0].code_object = code_object; + Py_INCREF(code_object); + } + return; + } + pos = __pyx_bisect_code_objects(__pyx_code_cache.entries, __pyx_code_cache.count, code_line); + if ((pos < __pyx_code_cache.count) && unlikely(__pyx_code_cache.entries[pos].code_line == code_line)) { + PyCodeObject* tmp = entries[pos].code_object; + entries[pos].code_object = code_object; + Py_DECREF(tmp); + return; + } + if (__pyx_code_cache.count == __pyx_code_cache.max_count) { + int new_max = __pyx_code_cache.max_count + 64; + entries = (__Pyx_CodeObjectCacheEntry*)PyMem_Realloc( + __pyx_code_cache.entries, new_max*sizeof(__Pyx_CodeObjectCacheEntry)); + if (unlikely(!entries)) { + return; + } + __pyx_code_cache.entries = entries; + __pyx_code_cache.max_count = new_max; + } + for (i=__pyx_code_cache.count; i>pos; i--) { + entries[i] = entries[i-1]; + } + entries[pos].code_line = code_line; + entries[pos].code_object = code_object; + __pyx_code_cache.count++; + Py_INCREF(code_object); +} + +#include "compile.h" +#include "frameobject.h" +#include "traceback.h" +static PyCodeObject* __Pyx_CreateCodeObjectForTraceback( + const char *funcname, int c_line, + int py_line, const char *filename) { + PyCodeObject *py_code = 0; + PyObject *py_srcfile = 0; + PyObject *py_funcname = 0; + #if PY_MAJOR_VERSION < 3 + py_srcfile = PyString_FromString(filename); + #else + py_srcfile = PyUnicode_FromString(filename); + #endif + if (!py_srcfile) goto bad; + if (c_line) { + #if PY_MAJOR_VERSION < 3 + py_funcname = PyString_FromFormat( "%s (%s:%d)", funcname, __pyx_cfilenm, c_line); + #else + py_funcname = PyUnicode_FromFormat( "%s (%s:%d)", funcname, __pyx_cfilenm, c_line); + #endif + } + else { + #if PY_MAJOR_VERSION < 3 + py_funcname = PyString_FromString(funcname); + #else + py_funcname = PyUnicode_FromString(funcname); + #endif + } + if (!py_funcname) goto bad; + py_code = __Pyx_PyCode_New( + 0, /*int argcount,*/ + 0, /*int kwonlyargcount,*/ + 0, /*int nlocals,*/ + 0, /*int stacksize,*/ + 0, /*int flags,*/ + __pyx_empty_bytes, /*PyObject *code,*/ + __pyx_empty_tuple, /*PyObject *consts,*/ + __pyx_empty_tuple, /*PyObject *names,*/ + __pyx_empty_tuple, /*PyObject *varnames,*/ + __pyx_empty_tuple, /*PyObject *freevars,*/ + __pyx_empty_tuple, /*PyObject *cellvars,*/ + py_srcfile, /*PyObject *filename,*/ + py_funcname, /*PyObject *name,*/ + py_line, /*int firstlineno,*/ + __pyx_empty_bytes /*PyObject *lnotab*/ + ); + Py_DECREF(py_srcfile); + Py_DECREF(py_funcname); + return py_code; +bad: + Py_XDECREF(py_srcfile); + Py_XDECREF(py_funcname); + return NULL; +} +static void __Pyx_AddTraceback(const char *funcname, int c_line, + int py_line, const char *filename) { + PyCodeObject *py_code = 0; + PyObject *py_globals = 0; + PyFrameObject *py_frame = 0; + py_code = __pyx_find_code_object(c_line ? c_line : py_line); + if (!py_code) { + py_code = __Pyx_CreateCodeObjectForTraceback( + funcname, c_line, py_line, filename); + if (!py_code) goto bad; + __pyx_insert_code_object(c_line ? c_line : py_line, py_code); + } + py_globals = PyModule_GetDict(__pyx_m); + if (!py_globals) goto bad; + py_frame = PyFrame_New( + PyThreadState_GET(), /*PyThreadState *tstate,*/ + py_code, /*PyCodeObject *code,*/ + py_globals, /*PyObject *globals,*/ + 0 /*PyObject *locals*/ + ); + if (!py_frame) goto bad; + py_frame->f_lineno = py_line; + PyTraceBack_Here(py_frame); +bad: + Py_XDECREF(py_code); + Py_XDECREF(py_frame); +} + +static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) { + while (t->p) { + #if PY_MAJOR_VERSION < 3 + if (t->is_unicode) { + *t->p = PyUnicode_DecodeUTF8(t->s, t->n - 1, NULL); + } else if (t->intern) { + *t->p = PyString_InternFromString(t->s); + } else { + *t->p = PyString_FromStringAndSize(t->s, t->n - 1); + } + #else /* Python 3+ has unicode identifiers */ + if (t->is_unicode | t->is_str) { + if (t->intern) { + *t->p = PyUnicode_InternFromString(t->s); + } else if (t->encoding) { + *t->p = PyUnicode_Decode(t->s, t->n - 1, t->encoding, NULL); + } else { + *t->p = PyUnicode_FromStringAndSize(t->s, t->n - 1); + } + } else { + *t->p = PyBytes_FromStringAndSize(t->s, t->n - 1); + } + #endif + if (!*t->p) + return -1; + ++t; + } + return 0; +} + +static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(char* c_str) { + return __Pyx_PyUnicode_FromStringAndSize(c_str, strlen(c_str)); +} +static CYTHON_INLINE char* __Pyx_PyObject_AsString(PyObject* o) { + Py_ssize_t ignore; + return __Pyx_PyObject_AsStringAndSize(o, &ignore); +} +static CYTHON_INLINE char* __Pyx_PyObject_AsStringAndSize(PyObject* o, Py_ssize_t *length) { +#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT + if ( +#if PY_MAJOR_VERSION < 3 && __PYX_DEFAULT_STRING_ENCODING_IS_ASCII + __Pyx_sys_getdefaultencoding_not_ascii && +#endif + PyUnicode_Check(o)) { +#if PY_VERSION_HEX < 0x03030000 + char* defenc_c; + PyObject* defenc = _PyUnicode_AsDefaultEncodedString(o, NULL); + if (!defenc) return NULL; + defenc_c = PyBytes_AS_STRING(defenc); +#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII + { + char* end = defenc_c + PyBytes_GET_SIZE(defenc); + char* c; + for (c = defenc_c; c < end; c++) { + if ((unsigned char) (*c) >= 128) { + PyUnicode_AsASCIIString(o); + return NULL; + } + } + } +#endif /*__PYX_DEFAULT_STRING_ENCODING_IS_ASCII*/ + *length = PyBytes_GET_SIZE(defenc); + return defenc_c; +#else /* PY_VERSION_HEX < 0x03030000 */ + if (PyUnicode_READY(o) == -1) return NULL; +#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII + if (PyUnicode_IS_ASCII(o)) { + *length = PyUnicode_GET_DATA_SIZE(o); + return PyUnicode_AsUTF8(o); + } else { + PyUnicode_AsASCIIString(o); + return NULL; + } +#else /* __PYX_DEFAULT_STRING_ENCODING_IS_ASCII */ + return PyUnicode_AsUTF8AndSize(o, length); +#endif /* __PYX_DEFAULT_STRING_ENCODING_IS_ASCII */ +#endif /* PY_VERSION_HEX < 0x03030000 */ + } else +#endif /* __PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT */ +#if !CYTHON_COMPILING_IN_PYPY +#if PY_VERSION_HEX >= 0x02060000 + if (PyByteArray_Check(o)) { + *length = PyByteArray_GET_SIZE(o); + return PyByteArray_AS_STRING(o); + } else +#endif +#endif + { + char* result; + int r = PyBytes_AsStringAndSize(o, &result, length); + if (unlikely(r < 0)) { + return NULL; + } else { + return result; + } + } +} +static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject* x) { + int is_true = x == Py_True; + if (is_true | (x == Py_False) | (x == Py_None)) return is_true; + else return PyObject_IsTrue(x); +} +static CYTHON_INLINE PyObject* __Pyx_PyNumber_Int(PyObject* x) { + PyNumberMethods *m; + const char *name = NULL; + PyObject *res = NULL; +#if PY_MAJOR_VERSION < 3 + if (PyInt_Check(x) || PyLong_Check(x)) +#else + if (PyLong_Check(x)) +#endif + return Py_INCREF(x), x; + m = Py_TYPE(x)->tp_as_number; +#if PY_MAJOR_VERSION < 3 + if (m && m->nb_int) { + name = "int"; + res = PyNumber_Int(x); + } + else if (m && m->nb_long) { + name = "long"; + res = PyNumber_Long(x); + } +#else + if (m && m->nb_int) { + name = "int"; + res = PyNumber_Long(x); + } +#endif + if (res) { +#if PY_MAJOR_VERSION < 3 + if (!PyInt_Check(res) && !PyLong_Check(res)) { +#else + if (!PyLong_Check(res)) { +#endif + PyErr_Format(PyExc_TypeError, + "__%.4s__ returned non-%.4s (type %.200s)", + name, name, Py_TYPE(res)->tp_name); + Py_DECREF(res); + return NULL; + } + } + else if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_TypeError, + "an integer is required"); + } + return res; +} +#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + #include "longintrepr.h" + #endif +#endif +static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject* b) { + Py_ssize_t ival; + PyObject *x; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_CheckExact(b))) + return PyInt_AS_LONG(b); +#endif + if (likely(PyLong_CheckExact(b))) { + #if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + switch (Py_SIZE(b)) { + case -1: return -(sdigit)((PyLongObject*)b)->ob_digit[0]; + case 0: return 0; + case 1: return ((PyLongObject*)b)->ob_digit[0]; + } + #endif + #endif + #if PY_VERSION_HEX < 0x02060000 + return PyInt_AsSsize_t(b); + #else + return PyLong_AsSsize_t(b); + #endif + } + x = PyNumber_Index(b); + if (!x) return -1; + ival = PyInt_AsSsize_t(x); + Py_DECREF(x); + return ival; +} +static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t ival) { +#if PY_VERSION_HEX < 0x02050000 + if (ival <= LONG_MAX) + return PyInt_FromLong((long)ival); + else { + unsigned char *bytes = (unsigned char *) &ival; + int one = 1; int little = (int)*(unsigned char*)&one; + return _PyLong_FromByteArray(bytes, sizeof(size_t), little, 0); + } +#else + return PyInt_FromSize_t(ival); +#endif +} + + +#endif /* Py_PYTHON_H */ diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd new file mode 100644 index 000000000..906714f5a --- /dev/null +++ b/spacy/lexeme.pxd @@ -0,0 +1,35 @@ +from libc.stdint cimport uint64_t + + +ctypedef int ClusterID +ctypedef uint64_t StringHash + + +cdef struct Lexeme: + StringHash sic # Hash of the original string + StringHash lex # Hash of the word, with punctuation and clitics split off + StringHash normed # Hash of the normalized version of lex + StringHash last3 # Last 3 characters of the token + Py_UNICODE first # First character of the token + + double prob # What is the log probability of the lex value? + ClusterID cluster # Brown cluster of the token + + bint oft_upper # Is the lowered version of the lex value often in all caps? + bint oft_title # Is the lowered version of the lex value often title-cased? + Lexeme* tail # Lexemes are linked lists, to deal with sub-tokens + + +# Use these to access the Lexeme fields via get_attr(Lexeme*, LexAttr), which +# has a conditional to pick out the correct item. This allows safe iteration +# over the Lexeme, via: +# for field in range(LexAttr.n): get_attr(Lexeme*, field) +cdef enum HashFields: + sic + lex + normed + cluster + n + + +#cdef uint64_t get_attr(Lexeme* word, HashFields attr) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx new file mode 100644 index 000000000..a546570cc --- /dev/null +++ b/spacy/lexeme.pyx @@ -0,0 +1,114 @@ +'''Accessors for Lexeme properties, given a lex_id, which is cast to a Lexeme*. +Mostly useful from Python-space. From Cython-space, you can just cast to +Lexeme* yourself. +''' + + +cpdef StringHash sic_of(size_t lex_id) except 0: + '''Access the `sic' field of the Lexeme pointed to by lex_id. + + The sic field stores the hash of the whitespace-delimited string-chunk used to + construct the Lexeme. + + >>> [unhash(sic_of(lex_id)) for lex_id in from_string(u'Hi! world')] + [u'Hi!', u'', u'world] + ''' + return (lex_id).sic + + +cpdef StringHash lex_of(size_t lex_id) except 0: + '''Access the `lex' field of the Lexeme pointed to by lex_id. + + The lex field is the hash of the string you would expect to get back from + a standard tokenizer, i.e. the word with punctuation and other non-whitespace + delimited tokens split off. The other fields refer to properties of the + string that the lex field stores a hash of, except sic and tail. + + >>> [unhash(lex_of(lex_id) for lex_id in from_string(u'Hi! world')] + [u'Hi', u'!', u'world'] + ''' + return (lex_id).lex + + +cpdef ClusterID cluster_of(size_t lex_id): + '''Access the `cluster' field of the Lexeme pointed to by lex_id, which + gives an integer representation of the cluster ID of the word, + which should be understood as a binary address: + + >>> strings = (u'pineapple', u'apple', u'dapple', u'scalable') + >>> token_ids = [lookup(s) for s in strings] + >>> clusters = [cluster_of(t) for t in token_ids] + >>> print ["{0:b"} % cluster_of(t) for t in token_ids] + ["100111110110", "100111100100", "01010111011001", "100111110110"] + + The clusterings are unideal, but often slightly useful. + "pineapple" and "apple" share a long prefix, indicating a similar meaning, + while "dapple" is totally different. On the other hand, "scalable" receives + the same cluster ID as "pineapple", which is not what we'd like. + ''' + return (lex_id).cluster + + +cpdef Py_UNICODE first_of(size_t lex_id): + '''Access the `first' field of the Lexeme pointed to by lex_id, which + stores the first character of the lex string of the word. + + >>> lex_id = lookup(u'Hello') + >>> unhash(first_of(lex_id)) + u'H' + ''' + return (lex_id).first + + +cpdef double prob_of(size_t lex_id): + '''Access the `prob' field of the Lexeme pointed to by lex_id, which stores + the smoothed unigram log probability of the word, as estimated from a large + text corpus. By default, probabilities are based on counts from Gigaword, + smoothed using Knesser-Ney; but any probabilities file can be supplied to + load_probs. + + >>> prob_of(lookup(u'world')) + -20.10340371976182 + ''' + pass + + +cpdef StringHash last3_of(size_t lex_id): + '''Access the `last3' field of the Lexeme pointed to by lex_id, which stores + the hash of the last three characters of the word: + + >>> lex_ids = [lookup(w) for w in (u'Hello', u'!')] + >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids] + [u'llo', u'!'] + ''' + return (lex_id).last3 + + +cpdef bint is_oft_upper(size_t lex_id): + '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which + stores whether the lowered version of the string hashed by `lex' is found + in all-upper case frequently in a large sample of text. Users are free + to load different data, by default we use a sample from Wikipedia, with + a threshold of 0.95, picked to maximize mutual information for POS tagging. + + >>> is_oft_upper(lookup(u'abc')) + True + >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer + True + ''' + return (lex_id).oft_upper + + +cpdef bint is_oft_title(size_t lex_id): + '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which + stores whether the lowered version of the string hashed by `lex' is found + title-cased frequently in a large sample of text. Users are free + to load different data, by default we use a sample from Wikipedia, with + a threshold of 0.3, picked to maximize mutual information for POS tagging. + + >>> is_oft_title(lookup(u'marcus')) + True + >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value + True + ''' + return (lex_id).oft_title diff --git a/spacy/spacy.cpp b/spacy/spacy.cpp new file mode 100644 index 000000000..dff7c9ba8 --- /dev/null +++ b/spacy/spacy.cpp @@ -0,0 +1,2064 @@ +/* Generated by Cython 0.20.1 on Sat Jul 5 20:44:26 2014 */ + +#define PY_SSIZE_T_CLEAN +#ifndef CYTHON_USE_PYLONG_INTERNALS +#ifdef PYLONG_BITS_IN_DIGIT +#define CYTHON_USE_PYLONG_INTERNALS 0 +#else +#include "pyconfig.h" +#ifdef PYLONG_BITS_IN_DIGIT +#define CYTHON_USE_PYLONG_INTERNALS 1 +#else +#define CYTHON_USE_PYLONG_INTERNALS 0 +#endif +#endif +#endif +#include "Python.h" +#ifndef Py_PYTHON_H + #error Python headers needed to compile C extensions, please install development version of Python. +#elif PY_VERSION_HEX < 0x02040000 + #error Cython requires Python 2.4+. +#else +#define CYTHON_ABI "0_20_1" +#include /* For offsetof */ +#ifndef offsetof +#define offsetof(type, member) ( (size_t) & ((type*)0) -> member ) +#endif +#if !defined(WIN32) && !defined(MS_WINDOWS) + #ifndef __stdcall + #define __stdcall + #endif + #ifndef __cdecl + #define __cdecl + #endif + #ifndef __fastcall + #define __fastcall + #endif +#endif +#ifndef DL_IMPORT + #define DL_IMPORT(t) t +#endif +#ifndef DL_EXPORT + #define DL_EXPORT(t) t +#endif +#ifndef PY_LONG_LONG + #define PY_LONG_LONG LONG_LONG +#endif +#ifndef Py_HUGE_VAL + #define Py_HUGE_VAL HUGE_VAL +#endif +#ifdef PYPY_VERSION +#define CYTHON_COMPILING_IN_PYPY 1 +#define CYTHON_COMPILING_IN_CPYTHON 0 +#else +#define CYTHON_COMPILING_IN_PYPY 0 +#define CYTHON_COMPILING_IN_CPYTHON 1 +#endif +#if CYTHON_COMPILING_IN_PYPY +#define Py_OptimizeFlag 0 +#endif +#if PY_VERSION_HEX < 0x02050000 + typedef int Py_ssize_t; + #define PY_SSIZE_T_MAX INT_MAX + #define PY_SSIZE_T_MIN INT_MIN + #define PY_FORMAT_SIZE_T "" + #define CYTHON_FORMAT_SSIZE_T "" + #define PyInt_FromSsize_t(z) PyInt_FromLong(z) + #define PyInt_AsSsize_t(o) __Pyx_PyInt_As_int(o) + #define PyNumber_Index(o) ((PyNumber_Check(o) && !PyFloat_Check(o)) ? PyNumber_Int(o) : \ + (PyErr_Format(PyExc_TypeError, \ + "expected index value, got %.200s", Py_TYPE(o)->tp_name), \ + (PyObject*)0)) + #define __Pyx_PyIndex_Check(o) (PyNumber_Check(o) && !PyFloat_Check(o) && \ + !PyComplex_Check(o)) + #define PyIndex_Check __Pyx_PyIndex_Check + #define PyErr_WarnEx(category, message, stacklevel) PyErr_Warn(category, message) + #define __PYX_BUILD_PY_SSIZE_T "i" +#else + #define __PYX_BUILD_PY_SSIZE_T "n" + #define CYTHON_FORMAT_SSIZE_T "z" + #define __Pyx_PyIndex_Check PyIndex_Check +#endif +#if PY_VERSION_HEX < 0x02060000 + #define Py_REFCNT(ob) (((PyObject*)(ob))->ob_refcnt) + #define Py_TYPE(ob) (((PyObject*)(ob))->ob_type) + #define Py_SIZE(ob) (((PyVarObject*)(ob))->ob_size) + #define PyVarObject_HEAD_INIT(type, size) \ + PyObject_HEAD_INIT(type) size, + #define PyType_Modified(t) + typedef struct { + void *buf; + PyObject *obj; + Py_ssize_t len; + Py_ssize_t itemsize; + int readonly; + int ndim; + char *format; + Py_ssize_t *shape; + Py_ssize_t *strides; + Py_ssize_t *suboffsets; + void *internal; + } Py_buffer; + #define PyBUF_SIMPLE 0 + #define PyBUF_WRITABLE 0x0001 + #define PyBUF_FORMAT 0x0004 + #define PyBUF_ND 0x0008 + #define PyBUF_STRIDES (0x0010 | PyBUF_ND) + #define PyBUF_C_CONTIGUOUS (0x0020 | PyBUF_STRIDES) + #define PyBUF_F_CONTIGUOUS (0x0040 | PyBUF_STRIDES) + #define PyBUF_ANY_CONTIGUOUS (0x0080 | PyBUF_STRIDES) + #define PyBUF_INDIRECT (0x0100 | PyBUF_STRIDES) + #define PyBUF_RECORDS (PyBUF_STRIDES | PyBUF_FORMAT | PyBUF_WRITABLE) + #define PyBUF_FULL (PyBUF_INDIRECT | PyBUF_FORMAT | PyBUF_WRITABLE) + typedef int (*getbufferproc)(PyObject *, Py_buffer *, int); + typedef void (*releasebufferproc)(PyObject *, Py_buffer *); +#endif +#if PY_MAJOR_VERSION < 3 + #define __Pyx_BUILTIN_MODULE_NAME "__builtin__" + #define __Pyx_PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos) \ + PyCode_New(a+k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos) + #define __Pyx_DefaultClassType PyClass_Type +#else + #define __Pyx_BUILTIN_MODULE_NAME "builtins" + #define __Pyx_PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos) \ + PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos) + #define __Pyx_DefaultClassType PyType_Type +#endif +#if PY_VERSION_HEX < 0x02060000 + #define PyUnicode_FromString(s) PyUnicode_Decode(s, strlen(s), "UTF-8", "strict") +#endif +#if PY_MAJOR_VERSION >= 3 + #define Py_TPFLAGS_CHECKTYPES 0 + #define Py_TPFLAGS_HAVE_INDEX 0 +#endif +#if (PY_VERSION_HEX < 0x02060000) || (PY_MAJOR_VERSION >= 3) + #define Py_TPFLAGS_HAVE_NEWBUFFER 0 +#endif +#if PY_VERSION_HEX < 0x02060000 + #define Py_TPFLAGS_HAVE_VERSION_TAG 0 +#endif +#if PY_VERSION_HEX < 0x02060000 && !defined(Py_TPFLAGS_IS_ABSTRACT) + #define Py_TPFLAGS_IS_ABSTRACT 0 +#endif +#if PY_VERSION_HEX < 0x030400a1 && !defined(Py_TPFLAGS_HAVE_FINALIZE) + #define Py_TPFLAGS_HAVE_FINALIZE 0 +#endif +#if PY_VERSION_HEX > 0x03030000 && defined(PyUnicode_KIND) + #define CYTHON_PEP393_ENABLED 1 + #define __Pyx_PyUnicode_READY(op) (likely(PyUnicode_IS_READY(op)) ? \ + 0 : _PyUnicode_Ready((PyObject *)(op))) + #define __Pyx_PyUnicode_GET_LENGTH(u) PyUnicode_GET_LENGTH(u) + #define __Pyx_PyUnicode_READ_CHAR(u, i) PyUnicode_READ_CHAR(u, i) + #define __Pyx_PyUnicode_KIND(u) PyUnicode_KIND(u) + #define __Pyx_PyUnicode_DATA(u) PyUnicode_DATA(u) + #define __Pyx_PyUnicode_READ(k, d, i) PyUnicode_READ(k, d, i) +#else + #define CYTHON_PEP393_ENABLED 0 + #define __Pyx_PyUnicode_READY(op) (0) + #define __Pyx_PyUnicode_GET_LENGTH(u) PyUnicode_GET_SIZE(u) + #define __Pyx_PyUnicode_READ_CHAR(u, i) ((Py_UCS4)(PyUnicode_AS_UNICODE(u)[i])) + #define __Pyx_PyUnicode_KIND(u) (sizeof(Py_UNICODE)) + #define __Pyx_PyUnicode_DATA(u) ((void*)PyUnicode_AS_UNICODE(u)) + #define __Pyx_PyUnicode_READ(k, d, i) ((void)(k), (Py_UCS4)(((Py_UNICODE*)d)[i])) +#endif +#if CYTHON_COMPILING_IN_PYPY + #define __Pyx_PyUnicode_Concat(a, b) PyNumber_Add(a, b) + #define __Pyx_PyUnicode_ConcatSafe(a, b) PyNumber_Add(a, b) +#else + #define __Pyx_PyUnicode_Concat(a, b) PyUnicode_Concat(a, b) + #define __Pyx_PyUnicode_ConcatSafe(a, b) ((unlikely((a) == Py_None) || unlikely((b) == Py_None)) ? \ + PyNumber_Add(a, b) : __Pyx_PyUnicode_Concat(a, b)) +#endif +#define __Pyx_PyString_FormatSafe(a, b) ((unlikely((a) == Py_None)) ? PyNumber_Remainder(a, b) : __Pyx_PyString_Format(a, b)) +#define __Pyx_PyUnicode_FormatSafe(a, b) ((unlikely((a) == Py_None)) ? PyNumber_Remainder(a, b) : PyUnicode_Format(a, b)) +#if PY_MAJOR_VERSION >= 3 + #define __Pyx_PyString_Format(a, b) PyUnicode_Format(a, b) +#else + #define __Pyx_PyString_Format(a, b) PyString_Format(a, b) +#endif +#if PY_MAJOR_VERSION >= 3 + #define PyBaseString_Type PyUnicode_Type + #define PyStringObject PyUnicodeObject + #define PyString_Type PyUnicode_Type + #define PyString_Check PyUnicode_Check + #define PyString_CheckExact PyUnicode_CheckExact +#endif +#if PY_VERSION_HEX < 0x02060000 + #define PyBytesObject PyStringObject + #define PyBytes_Type PyString_Type + #define PyBytes_Check PyString_Check + #define PyBytes_CheckExact PyString_CheckExact + #define PyBytes_FromString PyString_FromString + #define PyBytes_FromStringAndSize PyString_FromStringAndSize + #define PyBytes_FromFormat PyString_FromFormat + #define PyBytes_DecodeEscape PyString_DecodeEscape + #define PyBytes_AsString PyString_AsString + #define PyBytes_AsStringAndSize PyString_AsStringAndSize + #define PyBytes_Size PyString_Size + #define PyBytes_AS_STRING PyString_AS_STRING + #define PyBytes_GET_SIZE PyString_GET_SIZE + #define PyBytes_Repr PyString_Repr + #define PyBytes_Concat PyString_Concat + #define PyBytes_ConcatAndDel PyString_ConcatAndDel +#endif +#if PY_MAJOR_VERSION >= 3 + #define __Pyx_PyBaseString_Check(obj) PyUnicode_Check(obj) + #define __Pyx_PyBaseString_CheckExact(obj) PyUnicode_CheckExact(obj) +#else + #define __Pyx_PyBaseString_Check(obj) (PyString_CheckExact(obj) || PyUnicode_CheckExact(obj) || \ + PyString_Check(obj) || PyUnicode_Check(obj)) + #define __Pyx_PyBaseString_CheckExact(obj) (PyString_CheckExact(obj) || PyUnicode_CheckExact(obj)) +#endif +#if PY_VERSION_HEX < 0x02060000 + #define PySet_Check(obj) PyObject_TypeCheck(obj, &PySet_Type) + #define PyFrozenSet_Check(obj) PyObject_TypeCheck(obj, &PyFrozenSet_Type) +#endif +#ifndef PySet_CheckExact + #define PySet_CheckExact(obj) (Py_TYPE(obj) == &PySet_Type) +#endif +#define __Pyx_TypeCheck(obj, type) PyObject_TypeCheck(obj, (PyTypeObject *)type) +#if PY_MAJOR_VERSION >= 3 + #define PyIntObject PyLongObject + #define PyInt_Type PyLong_Type + #define PyInt_Check(op) PyLong_Check(op) + #define PyInt_CheckExact(op) PyLong_CheckExact(op) + #define PyInt_FromString PyLong_FromString + #define PyInt_FromUnicode PyLong_FromUnicode + #define PyInt_FromLong PyLong_FromLong + #define PyInt_FromSize_t PyLong_FromSize_t + #define PyInt_FromSsize_t PyLong_FromSsize_t + #define PyInt_AsLong PyLong_AsLong + #define PyInt_AS_LONG PyLong_AS_LONG + #define PyInt_AsSsize_t PyLong_AsSsize_t + #define PyInt_AsUnsignedLongMask PyLong_AsUnsignedLongMask + #define PyInt_AsUnsignedLongLongMask PyLong_AsUnsignedLongLongMask + #define PyNumber_Int PyNumber_Long +#endif +#if PY_MAJOR_VERSION >= 3 + #define PyBoolObject PyLongObject +#endif +#if PY_VERSION_HEX < 0x030200A4 + typedef long Py_hash_t; + #define __Pyx_PyInt_FromHash_t PyInt_FromLong + #define __Pyx_PyInt_AsHash_t PyInt_AsLong +#else + #define __Pyx_PyInt_FromHash_t PyInt_FromSsize_t + #define __Pyx_PyInt_AsHash_t PyInt_AsSsize_t +#endif +#if (PY_MAJOR_VERSION < 3) || (PY_VERSION_HEX >= 0x03010300) + #define __Pyx_PySequence_GetSlice(obj, a, b) PySequence_GetSlice(obj, a, b) + #define __Pyx_PySequence_SetSlice(obj, a, b, value) PySequence_SetSlice(obj, a, b, value) + #define __Pyx_PySequence_DelSlice(obj, a, b) PySequence_DelSlice(obj, a, b) +#else + #define __Pyx_PySequence_GetSlice(obj, a, b) (unlikely(!(obj)) ? \ + (PyErr_SetString(PyExc_SystemError, "null argument to internal routine"), (PyObject*)0) : \ + (likely((obj)->ob_type->tp_as_mapping) ? (PySequence_GetSlice(obj, a, b)) : \ + (PyErr_Format(PyExc_TypeError, "'%.200s' object is unsliceable", (obj)->ob_type->tp_name), (PyObject*)0))) + #define __Pyx_PySequence_SetSlice(obj, a, b, value) (unlikely(!(obj)) ? \ + (PyErr_SetString(PyExc_SystemError, "null argument to internal routine"), -1) : \ + (likely((obj)->ob_type->tp_as_mapping) ? (PySequence_SetSlice(obj, a, b, value)) : \ + (PyErr_Format(PyExc_TypeError, "'%.200s' object doesn't support slice assignment", (obj)->ob_type->tp_name), -1))) + #define __Pyx_PySequence_DelSlice(obj, a, b) (unlikely(!(obj)) ? \ + (PyErr_SetString(PyExc_SystemError, "null argument to internal routine"), -1) : \ + (likely((obj)->ob_type->tp_as_mapping) ? (PySequence_DelSlice(obj, a, b)) : \ + (PyErr_Format(PyExc_TypeError, "'%.200s' object doesn't support slice deletion", (obj)->ob_type->tp_name), -1))) +#endif +#if PY_MAJOR_VERSION >= 3 + #define PyMethod_New(func, self, klass) ((self) ? PyMethod_New(func, self) : PyInstanceMethod_New(func)) +#endif +#if PY_VERSION_HEX < 0x02050000 + #define __Pyx_GetAttrString(o,n) PyObject_GetAttrString((o),((char *)(n))) + #define __Pyx_SetAttrString(o,n,a) PyObject_SetAttrString((o),((char *)(n)),(a)) + #define __Pyx_DelAttrString(o,n) PyObject_DelAttrString((o),((char *)(n))) +#else + #define __Pyx_GetAttrString(o,n) PyObject_GetAttrString((o),(n)) + #define __Pyx_SetAttrString(o,n,a) PyObject_SetAttrString((o),(n),(a)) + #define __Pyx_DelAttrString(o,n) PyObject_DelAttrString((o),(n)) +#endif +#if PY_VERSION_HEX < 0x02050000 + #define __Pyx_NAMESTR(n) ((char *)(n)) + #define __Pyx_DOCSTR(n) ((char *)(n)) +#else + #define __Pyx_NAMESTR(n) (n) + #define __Pyx_DOCSTR(n) (n) +#endif +#ifndef CYTHON_INLINE + #if defined(__GNUC__) + #define CYTHON_INLINE __inline__ + #elif defined(_MSC_VER) + #define CYTHON_INLINE __inline + #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define CYTHON_INLINE inline + #else + #define CYTHON_INLINE + #endif +#endif +#ifndef CYTHON_RESTRICT + #if defined(__GNUC__) + #define CYTHON_RESTRICT __restrict__ + #elif defined(_MSC_VER) && _MSC_VER >= 1400 + #define CYTHON_RESTRICT __restrict + #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define CYTHON_RESTRICT restrict + #else + #define CYTHON_RESTRICT + #endif +#endif +#ifdef NAN +#define __PYX_NAN() ((float) NAN) +#else +static CYTHON_INLINE float __PYX_NAN() { + /* Initialize NaN. The sign is irrelevant, an exponent with all bits 1 and + a nonzero mantissa means NaN. If the first bit in the mantissa is 1, it is + a quiet NaN. */ + float value; + memset(&value, 0xFF, sizeof(value)); + return value; +} +#endif + + +#if PY_MAJOR_VERSION >= 3 + #define __Pyx_PyNumber_Divide(x,y) PyNumber_TrueDivide(x,y) + #define __Pyx_PyNumber_InPlaceDivide(x,y) PyNumber_InPlaceTrueDivide(x,y) +#else + #define __Pyx_PyNumber_Divide(x,y) PyNumber_Divide(x,y) + #define __Pyx_PyNumber_InPlaceDivide(x,y) PyNumber_InPlaceDivide(x,y) +#endif + +#ifndef __PYX_EXTERN_C + #ifdef __cplusplus + #define __PYX_EXTERN_C extern "C" + #else + #define __PYX_EXTERN_C extern + #endif +#endif + +#if defined(WIN32) || defined(MS_WINDOWS) +#define _USE_MATH_DEFINES +#endif +#include +#define __PYX_HAVE__spacy__spacy +#define __PYX_HAVE_API__spacy__spacy +#include +#include "ios" +#include "new" +#include "stdexcept" +#include "typeinfo" +#include "stdint.h" +#ifdef _OPENMP +#include +#endif /* _OPENMP */ + +#ifdef PYREX_WITHOUT_ASSERTIONS +#define CYTHON_WITHOUT_ASSERTIONS +#endif + +#ifndef CYTHON_UNUSED +# if defined(__GNUC__) +# if !(defined(__cplusplus)) || (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) +# define CYTHON_UNUSED __attribute__ ((__unused__)) +# else +# define CYTHON_UNUSED +# endif +# elif defined(__ICC) || (defined(__INTEL_COMPILER) && !defined(_MSC_VER)) +# define CYTHON_UNUSED __attribute__ ((__unused__)) +# else +# define CYTHON_UNUSED +# endif +#endif +typedef struct {PyObject **p; char *s; const Py_ssize_t n; const char* encoding; + const char is_unicode; const char is_str; const char intern; } __Pyx_StringTabEntry; /*proto*/ + +#define __PYX_DEFAULT_STRING_ENCODING_IS_ASCII 0 +#define __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT 0 +#define __PYX_DEFAULT_STRING_ENCODING "" +#define __Pyx_PyObject_FromString __Pyx_PyBytes_FromString +#define __Pyx_PyObject_FromStringAndSize __Pyx_PyBytes_FromStringAndSize +#define __Pyx_fits_Py_ssize_t(v, type, is_signed) ( \ + (sizeof(type) < sizeof(Py_ssize_t)) || \ + (sizeof(type) > sizeof(Py_ssize_t) && \ + likely(v < (type)PY_SSIZE_T_MAX || \ + v == (type)PY_SSIZE_T_MAX) && \ + (!is_signed || likely(v > (type)PY_SSIZE_T_MIN || \ + v == (type)PY_SSIZE_T_MIN))) || \ + (sizeof(type) == sizeof(Py_ssize_t) && \ + (is_signed || likely(v < (type)PY_SSIZE_T_MAX || \ + v == (type)PY_SSIZE_T_MAX))) ) +static CYTHON_INLINE char* __Pyx_PyObject_AsString(PyObject*); +static CYTHON_INLINE char* __Pyx_PyObject_AsStringAndSize(PyObject*, Py_ssize_t* length); +#define __Pyx_PyByteArray_FromString(s) PyByteArray_FromStringAndSize((const char*)s, strlen((const char*)s)) +#define __Pyx_PyByteArray_FromStringAndSize(s, l) PyByteArray_FromStringAndSize((const char*)s, l) +#define __Pyx_PyBytes_FromString PyBytes_FromString +#define __Pyx_PyBytes_FromStringAndSize PyBytes_FromStringAndSize +static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(char*); +#if PY_MAJOR_VERSION < 3 + #define __Pyx_PyStr_FromString __Pyx_PyBytes_FromString + #define __Pyx_PyStr_FromStringAndSize __Pyx_PyBytes_FromStringAndSize +#else + #define __Pyx_PyStr_FromString __Pyx_PyUnicode_FromString + #define __Pyx_PyStr_FromStringAndSize __Pyx_PyUnicode_FromStringAndSize +#endif +#define __Pyx_PyObject_AsSString(s) ((signed char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_AsUString(s) ((unsigned char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_FromUString(s) __Pyx_PyObject_FromString((char*)s) +#define __Pyx_PyBytes_FromUString(s) __Pyx_PyBytes_FromString((char*)s) +#define __Pyx_PyByteArray_FromUString(s) __Pyx_PyByteArray_FromString((char*)s) +#define __Pyx_PyStr_FromUString(s) __Pyx_PyStr_FromString((char*)s) +#define __Pyx_PyUnicode_FromUString(s) __Pyx_PyUnicode_FromString((char*)s) +#if PY_MAJOR_VERSION < 3 +static CYTHON_INLINE size_t __Pyx_Py_UNICODE_strlen(const Py_UNICODE *u) +{ + const Py_UNICODE *u_end = u; + while (*u_end++) ; + return u_end - u - 1; +} +#else +#define __Pyx_Py_UNICODE_strlen Py_UNICODE_strlen +#endif +#define __Pyx_PyUnicode_FromUnicode(u) PyUnicode_FromUnicode(u, __Pyx_Py_UNICODE_strlen(u)) +#define __Pyx_PyUnicode_FromUnicodeAndLength PyUnicode_FromUnicode +#define __Pyx_PyUnicode_AsUnicode PyUnicode_AsUnicode +#define __Pyx_Owned_Py_None(b) (Py_INCREF(Py_None), Py_None) +#define __Pyx_PyBool_FromLong(b) ((b) ? (Py_INCREF(Py_True), Py_True) : (Py_INCREF(Py_False), Py_False)) +static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject*); +static CYTHON_INLINE PyObject* __Pyx_PyNumber_Int(PyObject* x); +static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject*); +static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t); +#if CYTHON_COMPILING_IN_CPYTHON +#define __pyx_PyFloat_AsDouble(x) (PyFloat_CheckExact(x) ? PyFloat_AS_DOUBLE(x) : PyFloat_AsDouble(x)) +#else +#define __pyx_PyFloat_AsDouble(x) PyFloat_AsDouble(x) +#endif +#define __pyx_PyFloat_AsFloat(x) ((float) __pyx_PyFloat_AsDouble(x)) +#if PY_MAJOR_VERSION < 3 && __PYX_DEFAULT_STRING_ENCODING_IS_ASCII +static int __Pyx_sys_getdefaultencoding_not_ascii; +static int __Pyx_init_sys_getdefaultencoding_params(void) { + PyObject* sys = NULL; + PyObject* default_encoding = NULL; + PyObject* ascii_chars_u = NULL; + PyObject* ascii_chars_b = NULL; + sys = PyImport_ImportModule("sys"); + if (sys == NULL) goto bad; + default_encoding = PyObject_CallMethod(sys, (char*) (const char*) "getdefaultencoding", NULL); + if (default_encoding == NULL) goto bad; + if (strcmp(PyBytes_AsString(default_encoding), "ascii") == 0) { + __Pyx_sys_getdefaultencoding_not_ascii = 0; + } else { + const char* default_encoding_c = PyBytes_AS_STRING(default_encoding); + char ascii_chars[128]; + int c; + for (c = 0; c < 128; c++) { + ascii_chars[c] = c; + } + __Pyx_sys_getdefaultencoding_not_ascii = 1; + ascii_chars_u = PyUnicode_DecodeASCII(ascii_chars, 128, NULL); + if (ascii_chars_u == NULL) goto bad; + ascii_chars_b = PyUnicode_AsEncodedString(ascii_chars_u, default_encoding_c, NULL); + if (ascii_chars_b == NULL || strncmp(ascii_chars, PyBytes_AS_STRING(ascii_chars_b), 128) != 0) { + PyErr_Format( + PyExc_ValueError, + "This module compiled with c_string_encoding=ascii, but default encoding '%.200s' is not a superset of ascii.", + default_encoding_c); + goto bad; + } + } + Py_XDECREF(sys); + Py_XDECREF(default_encoding); + Py_XDECREF(ascii_chars_u); + Py_XDECREF(ascii_chars_b); + return 0; +bad: + Py_XDECREF(sys); + Py_XDECREF(default_encoding); + Py_XDECREF(ascii_chars_u); + Py_XDECREF(ascii_chars_b); + return -1; +} +#endif +#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT && PY_MAJOR_VERSION >= 3 +#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_DecodeUTF8(c_str, size, NULL) +#else +#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_Decode(c_str, size, __PYX_DEFAULT_STRING_ENCODING, NULL) +#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT +static char* __PYX_DEFAULT_STRING_ENCODING; +static int __Pyx_init_sys_getdefaultencoding_params(void) { + PyObject* sys = NULL; + PyObject* default_encoding = NULL; + char* default_encoding_c; + sys = PyImport_ImportModule("sys"); + if (sys == NULL) goto bad; + default_encoding = PyObject_CallMethod(sys, (char*) (const char*) "getdefaultencoding", NULL); + if (default_encoding == NULL) goto bad; + default_encoding_c = PyBytes_AS_STRING(default_encoding); + __PYX_DEFAULT_STRING_ENCODING = (char*) malloc(strlen(default_encoding_c)); + strcpy(__PYX_DEFAULT_STRING_ENCODING, default_encoding_c); + Py_DECREF(sys); + Py_DECREF(default_encoding); + return 0; +bad: + Py_XDECREF(sys); + Py_XDECREF(default_encoding); + return -1; +} +#endif +#endif + + +#ifdef __GNUC__ + /* Test for GCC > 2.95 */ + #if __GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95)) + #define likely(x) __builtin_expect(!!(x), 1) + #define unlikely(x) __builtin_expect(!!(x), 0) + #else /* __GNUC__ > 2 ... */ + #define likely(x) (x) + #define unlikely(x) (x) + #endif /* __GNUC__ > 2 ... */ +#else /* __GNUC__ */ + #define likely(x) (x) + #define unlikely(x) (x) +#endif /* __GNUC__ */ + +static PyObject *__pyx_m; +static PyObject *__pyx_d; +static PyObject *__pyx_b; +static PyObject *__pyx_empty_tuple; +static PyObject *__pyx_empty_bytes; +static int __pyx_lineno; +static int __pyx_clineno = 0; +static const char * __pyx_cfilenm= __FILE__; +static const char *__pyx_filename; + + +static const char *__pyx_f[] = { + "spacy.pyx", + "stringsource", +}; + +/* "spacy/lexeme.pxd":4 + * + * + * ctypedef int ClusterID # <<<<<<<<<<<<<< + * ctypedef uint64_t StringHash + * + */ +typedef int __pyx_t_5spacy_6lexeme_ClusterID; + +/* "spacy/lexeme.pxd":5 + * + * ctypedef int ClusterID + * ctypedef uint64_t StringHash # <<<<<<<<<<<<<< + * + * + */ +typedef uint64_t __pyx_t_5spacy_6lexeme_StringHash; + +/*--- Type declarations ---*/ +struct __pyx_t_5spacy_6lexeme_Lexeme; + +/* "spacy/lexeme.pxd":27 + * # over the Lexeme, via: + * # for field in range(LexAttr.n): get_attr(Lexeme*, field) + * cdef enum HashFields: # <<<<<<<<<<<<<< + * sic + * lex + */ +enum __pyx_t_5spacy_6lexeme_HashFields { + __pyx_e_5spacy_6lexeme_sic, + __pyx_e_5spacy_6lexeme_lex, + __pyx_e_5spacy_6lexeme_normed, + __pyx_e_5spacy_6lexeme_cluster, + __pyx_e_5spacy_6lexeme_n +}; + +/* "spacy/lexeme.pxd":8 + * + * + * cdef struct Lexeme: # <<<<<<<<<<<<<< + * StringHash sic # Hash of the original string + * StringHash lex # Hash of the word, with punctuation and clitics split off + */ +struct __pyx_t_5spacy_6lexeme_Lexeme { + __pyx_t_5spacy_6lexeme_StringHash sic; + __pyx_t_5spacy_6lexeme_StringHash lex; + __pyx_t_5spacy_6lexeme_StringHash normed; + __pyx_t_5spacy_6lexeme_StringHash last3; + Py_UNICODE first; + double prob; + __pyx_t_5spacy_6lexeme_ClusterID cluster; + int oft_upper; + int oft_title; + struct __pyx_t_5spacy_6lexeme_Lexeme *tail; +}; +#ifndef CYTHON_REFNANNY + #define CYTHON_REFNANNY 0 +#endif +#if CYTHON_REFNANNY + typedef struct { + void (*INCREF)(void*, PyObject*, int); + void (*DECREF)(void*, PyObject*, int); + void (*GOTREF)(void*, PyObject*, int); + void (*GIVEREF)(void*, PyObject*, int); + void* (*SetupContext)(const char*, int, const char*); + void (*FinishContext)(void**); + } __Pyx_RefNannyAPIStruct; + static __Pyx_RefNannyAPIStruct *__Pyx_RefNanny = NULL; + static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname); /*proto*/ + #define __Pyx_RefNannyDeclarations void *__pyx_refnanny = NULL; +#ifdef WITH_THREAD + #define __Pyx_RefNannySetupContext(name, acquire_gil) \ + if (acquire_gil) { \ + PyGILState_STATE __pyx_gilstate_save = PyGILState_Ensure(); \ + __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__); \ + PyGILState_Release(__pyx_gilstate_save); \ + } else { \ + __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__); \ + } +#else + #define __Pyx_RefNannySetupContext(name, acquire_gil) \ + __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__) +#endif + #define __Pyx_RefNannyFinishContext() \ + __Pyx_RefNanny->FinishContext(&__pyx_refnanny) + #define __Pyx_INCREF(r) __Pyx_RefNanny->INCREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_DECREF(r) __Pyx_RefNanny->DECREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_GOTREF(r) __Pyx_RefNanny->GOTREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_GIVEREF(r) __Pyx_RefNanny->GIVEREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_XINCREF(r) do { if((r) != NULL) {__Pyx_INCREF(r); }} while(0) + #define __Pyx_XDECREF(r) do { if((r) != NULL) {__Pyx_DECREF(r); }} while(0) + #define __Pyx_XGOTREF(r) do { if((r) != NULL) {__Pyx_GOTREF(r); }} while(0) + #define __Pyx_XGIVEREF(r) do { if((r) != NULL) {__Pyx_GIVEREF(r);}} while(0) +#else + #define __Pyx_RefNannyDeclarations + #define __Pyx_RefNannySetupContext(name, acquire_gil) + #define __Pyx_RefNannyFinishContext() + #define __Pyx_INCREF(r) Py_INCREF(r) + #define __Pyx_DECREF(r) Py_DECREF(r) + #define __Pyx_GOTREF(r) + #define __Pyx_GIVEREF(r) + #define __Pyx_XINCREF(r) Py_XINCREF(r) + #define __Pyx_XDECREF(r) Py_XDECREF(r) + #define __Pyx_XGOTREF(r) + #define __Pyx_XGIVEREF(r) +#endif /* CYTHON_REFNANNY */ +#define __Pyx_XDECREF_SET(r, v) do { \ + PyObject *tmp = (PyObject *) r; \ + r = v; __Pyx_XDECREF(tmp); \ + } while (0) +#define __Pyx_DECREF_SET(r, v) do { \ + PyObject *tmp = (PyObject *) r; \ + r = v; __Pyx_DECREF(tmp); \ + } while (0) +#define __Pyx_CLEAR(r) do { PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);} while(0) +#define __Pyx_XCLEAR(r) do { if((r) != NULL) {PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);}} while(0) + +#if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, PyObject* attr_name) { + PyTypeObject* tp = Py_TYPE(obj); + if (likely(tp->tp_getattro)) + return tp->tp_getattro(obj, attr_name); +#if PY_MAJOR_VERSION < 3 + if (likely(tp->tp_getattr)) + return tp->tp_getattr(obj, PyString_AS_STRING(attr_name)); +#endif + return PyObject_GetAttr(obj, attr_name); +} +#else +#define __Pyx_PyObject_GetAttrStr(o,n) PyObject_GetAttr(o,n) +#endif + +static PyObject *__Pyx_GetBuiltinName(PyObject *name); /*proto*/ + +#if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE int __Pyx_ListComp_Append(PyObject* list, PyObject* x) { + PyListObject* L = (PyListObject*) list; + Py_ssize_t len = Py_SIZE(list); + if (likely(L->allocated > len)) { + Py_INCREF(x); + PyList_SET_ITEM(list, len, x); + Py_SIZE(list) = len+1; + return 0; + } + return PyList_Append(list, x); +} +#else +#define __Pyx_ListComp_Append(L,x) PyList_Append(L,x) +#endif + +static CYTHON_INLINE size_t __Pyx_PyInt_As_size_t(PyObject *); + +#ifndef __Pyx_CppExn2PyErr +#include +#include +#include +#include +static void __Pyx_CppExn2PyErr() { + try { + if (PyErr_Occurred()) + ; // let the latest Python exn pass through and ignore the current one + else + throw; + } catch (const std::bad_alloc& exn) { + PyErr_SetString(PyExc_MemoryError, exn.what()); + } catch (const std::bad_cast& exn) { + PyErr_SetString(PyExc_TypeError, exn.what()); + } catch (const std::domain_error& exn) { + PyErr_SetString(PyExc_ValueError, exn.what()); + } catch (const std::invalid_argument& exn) { + PyErr_SetString(PyExc_ValueError, exn.what()); + } catch (const std::ios_base::failure& exn) { + PyErr_SetString(PyExc_IOError, exn.what()); + } catch (const std::out_of_range& exn) { + PyErr_SetString(PyExc_IndexError, exn.what()); + } catch (const std::overflow_error& exn) { + PyErr_SetString(PyExc_OverflowError, exn.what()); + } catch (const std::range_error& exn) { + PyErr_SetString(PyExc_ArithmeticError, exn.what()); + } catch (const std::underflow_error& exn) { + PyErr_SetString(PyExc_ArithmeticError, exn.what()); + } catch (const std::exception& exn) { + PyErr_SetString(PyExc_RuntimeError, exn.what()); + } + catch (...) + { + PyErr_SetString(PyExc_RuntimeError, "Unknown exception"); + } +} +#endif + +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value); + +static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *); + +static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *); + +static int __Pyx_check_binary_version(void); + +static int __Pyx_ExportFunction(const char *name, void (*f)(void), const char *sig); /*proto*/ + +typedef struct { + int code_line; + PyCodeObject* code_object; +} __Pyx_CodeObjectCacheEntry; +struct __Pyx_CodeObjectCache { + int count; + int max_count; + __Pyx_CodeObjectCacheEntry* entries; +}; +static struct __Pyx_CodeObjectCache __pyx_code_cache = {0,0,NULL}; +static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line); +static PyCodeObject *__pyx_find_code_object(int code_line); +static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object); + +static void __Pyx_AddTraceback(const char *funcname, int c_line, + int py_line, const char *filename); /*proto*/ + +static int __Pyx_InitStrings(__Pyx_StringTabEntry *t); /*proto*/ + + +/* Module declarations from 'libcpp.vector' */ + +/* Module declarations from 'libc.stdint' */ + +/* Module declarations from 'spacy.lexeme' */ + +/* Module declarations from 'spacy.spacy' */ +static std::vector __pyx_f_5spacy_5spacy_expand_chunk(size_t, int __pyx_skip_dispatch); /*proto*/ +static PyObject *__pyx_convert_vector_to_py_size_t(const std::vector &); /*proto*/ +#define __Pyx_MODULE_NAME "spacy.spacy" +int __pyx_module_is_main_spacy__spacy = 0; + +/* Implementation of 'spacy.spacy' */ +static PyObject *__pyx_builtin_range; +static PyObject *__pyx_pf_5spacy_5spacy_expand_chunk(CYTHON_UNUSED PyObject *__pyx_self, size_t __pyx_v_addr); /* proto */ +static char __pyx_k_main[] = "__main__"; +static char __pyx_k_test[] = "__test__"; +static char __pyx_k_range[] = "range"; +static PyObject *__pyx_n_s_main; +static PyObject *__pyx_n_s_range; +static PyObject *__pyx_n_s_test; + +/* "spacy/spacy.pyx":5 + * + * + * cpdef vector[size_t] expand_chunk(size_t addr) except *: # <<<<<<<<<<<<<< + * cdef vector[size_t] tokens = vector[size_t]() + * word = addr + */ + +static PyObject *__pyx_pw_5spacy_5spacy_1expand_chunk(PyObject *__pyx_self, PyObject *__pyx_arg_addr); /*proto*/ +static std::vector __pyx_f_5spacy_5spacy_expand_chunk(size_t __pyx_v_addr, CYTHON_UNUSED int __pyx_skip_dispatch) { + std::vector __pyx_v_tokens; + struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_v_word; + std::vector __pyx_r; + __Pyx_RefNannyDeclarations + std::vector __pyx_t_1; + int __pyx_t_2; + struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_t_3; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("expand_chunk", 0); + + /* "spacy/spacy.pyx":6 + * + * cpdef vector[size_t] expand_chunk(size_t addr) except *: + * cdef vector[size_t] tokens = vector[size_t]() # <<<<<<<<<<<<<< + * word = addr + * while word is not NULL: + */ + try { + __pyx_t_1 = std::vector(); + } catch(...) { + __Pyx_CppExn2PyErr(); + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 6; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + } + __pyx_v_tokens = __pyx_t_1; + + /* "spacy/spacy.pyx":7 + * cpdef vector[size_t] expand_chunk(size_t addr) except *: + * cdef vector[size_t] tokens = vector[size_t]() + * word = addr # <<<<<<<<<<<<<< + * while word is not NULL: + * tokens.push_back(word) + */ + __pyx_v_word = ((struct __pyx_t_5spacy_6lexeme_Lexeme *)__pyx_v_addr); + + /* "spacy/spacy.pyx":8 + * cdef vector[size_t] tokens = vector[size_t]() + * word = addr + * while word is not NULL: # <<<<<<<<<<<<<< + * tokens.push_back(word) + * word = word.tail + */ + while (1) { + __pyx_t_2 = ((__pyx_v_word != NULL) != 0); + if (!__pyx_t_2) break; + + /* "spacy/spacy.pyx":9 + * word = addr + * while word is not NULL: + * tokens.push_back(word) # <<<<<<<<<<<<<< + * word = word.tail + * return tokens + */ + __pyx_v_tokens.push_back(((size_t)__pyx_v_word)); + + /* "spacy/spacy.pyx":10 + * while word is not NULL: + * tokens.push_back(word) + * word = word.tail # <<<<<<<<<<<<<< + * return tokens + * + */ + __pyx_t_3 = __pyx_v_word->tail; + __pyx_v_word = __pyx_t_3; + } + + /* "spacy/spacy.pyx":11 + * tokens.push_back(word) + * word = word.tail + * return tokens # <<<<<<<<<<<<<< + * + * + */ + __pyx_r = __pyx_v_tokens; + goto __pyx_L0; + + /* "spacy/spacy.pyx":5 + * + * + * cpdef vector[size_t] expand_chunk(size_t addr) except *: # <<<<<<<<<<<<<< + * cdef vector[size_t] tokens = vector[size_t]() + * word = addr + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_AddTraceback("spacy.spacy.expand_chunk", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* Python wrapper */ +static PyObject *__pyx_pw_5spacy_5spacy_1expand_chunk(PyObject *__pyx_self, PyObject *__pyx_arg_addr); /*proto*/ +static PyObject *__pyx_pw_5spacy_5spacy_1expand_chunk(PyObject *__pyx_self, PyObject *__pyx_arg_addr) { + size_t __pyx_v_addr; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("expand_chunk (wrapper)", 0); + assert(__pyx_arg_addr); { + __pyx_v_addr = __Pyx_PyInt_As_size_t(__pyx_arg_addr); if (unlikely((__pyx_v_addr == (size_t)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 5; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L3_error:; + __Pyx_AddTraceback("spacy.spacy.expand_chunk", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + __pyx_r = __pyx_pf_5spacy_5spacy_expand_chunk(__pyx_self, ((size_t)__pyx_v_addr)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5spacy_5spacy_expand_chunk(CYTHON_UNUSED PyObject *__pyx_self, size_t __pyx_v_addr) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + std::vector __pyx_t_1; + PyObject *__pyx_t_2 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("expand_chunk", 0); + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = __pyx_f_5spacy_5spacy_expand_chunk(__pyx_v_addr, 0); if (unlikely(PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 5; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __pyx_convert_vector_to_py_size_t(__pyx_t_1); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 5; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_2); + __pyx_r = __pyx_t_2; + __pyx_t_2 = 0; + goto __pyx_L0; + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_2); + __Pyx_AddTraceback("spacy.spacy.expand_chunk", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "spacy/spacy.pyx":62 + * + * + * cdef inline bint is_whitespace(Py_UNICODE c): # <<<<<<<<<<<<<< + * # TODO: Support other unicode spaces + * # https://www.cs.tut.fi/~jkorpela/chars/spaces.html + */ + +static CYTHON_INLINE int __pyx_f_5spacy_5spacy_is_whitespace(Py_UNICODE __pyx_v_c) { + int __pyx_r; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("is_whitespace", 0); + + /* "spacy/spacy.pyx":69 + * elif c == u'\n': + * return True + * elif c == u'\t': # <<<<<<<<<<<<<< + * return True + * else: + */ + switch (__pyx_v_c) { + + /* "spacy/spacy.pyx":65 + * # TODO: Support other unicode spaces + * # https://www.cs.tut.fi/~jkorpela/chars/spaces.html + * if c == u' ': # <<<<<<<<<<<<<< + * return True + * elif c == u'\n': + */ + case 32: + + /* "spacy/spacy.pyx":66 + * # https://www.cs.tut.fi/~jkorpela/chars/spaces.html + * if c == u' ': + * return True # <<<<<<<<<<<<<< + * elif c == u'\n': + * return True + */ + __pyx_r = 1; + goto __pyx_L0; + break; + + /* "spacy/spacy.pyx":67 + * if c == u' ': + * return True + * elif c == u'\n': # <<<<<<<<<<<<<< + * return True + * elif c == u'\t': + */ + case 10: + + /* "spacy/spacy.pyx":68 + * return True + * elif c == u'\n': + * return True # <<<<<<<<<<<<<< + * elif c == u'\t': + * return True + */ + __pyx_r = 1; + goto __pyx_L0; + break; + + /* "spacy/spacy.pyx":69 + * elif c == u'\n': + * return True + * elif c == u'\t': # <<<<<<<<<<<<<< + * return True + * else: + */ + case 9: + + /* "spacy/spacy.pyx":70 + * return True + * elif c == u'\t': + * return True # <<<<<<<<<<<<<< + * else: + * return False + */ + __pyx_r = 1; + goto __pyx_L0; + break; + default: + + /* "spacy/spacy.pyx":72 + * return True + * else: + * return False # <<<<<<<<<<<<<< + */ + __pyx_r = 0; + goto __pyx_L0; + break; + } + + /* "spacy/spacy.pyx":62 + * + * + * cdef inline bint is_whitespace(Py_UNICODE c): # <<<<<<<<<<<<<< + * # TODO: Support other unicode spaces + * # https://www.cs.tut.fi/~jkorpela/chars/spaces.html + */ + + /* function exit code */ + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "vector.to_py":63 + * + * @cname("__pyx_convert_vector_to_py_size_t") + * cdef object __pyx_convert_vector_to_py_size_t(vector[X]& v): # <<<<<<<<<<<<<< + * return [X_to_py(v[i]) for i in range(v.size())] + * + */ + +static PyObject *__pyx_convert_vector_to_py_size_t(const std::vector &__pyx_v_v) { + size_t __pyx_v_i; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + size_t __pyx_t_2; + size_t __pyx_t_3; + PyObject *__pyx_t_4 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("__pyx_convert_vector_to_py_size_t", 0); + + /* "vector.to_py":64 + * @cname("__pyx_convert_vector_to_py_size_t") + * cdef object __pyx_convert_vector_to_py_size_t(vector[X]& v): + * return [X_to_py(v[i]) for i in range(v.size())] # <<<<<<<<<<<<<< + * + * + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 64; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = __pyx_v_v.size(); + for (__pyx_t_3 = 0; __pyx_t_3 < __pyx_t_2; __pyx_t_3+=1) { + __pyx_v_i = __pyx_t_3; + __pyx_t_4 = __Pyx_PyInt_FromSize_t((__pyx_v_v[__pyx_v_i])); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 64; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_4); + if (unlikely(__Pyx_ListComp_Append(__pyx_t_1, (PyObject*)__pyx_t_4))) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 64; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + } + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "vector.to_py":63 + * + * @cname("__pyx_convert_vector_to_py_size_t") + * cdef object __pyx_convert_vector_to_py_size_t(vector[X]& v): # <<<<<<<<<<<<<< + * return [X_to_py(v[i]) for i in range(v.size())] + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_4); + __Pyx_AddTraceback("vector.to_py.__pyx_convert_vector_to_py_size_t", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyMethodDef __pyx_methods[] = { + {__Pyx_NAMESTR("expand_chunk"), (PyCFunction)__pyx_pw_5spacy_5spacy_1expand_chunk, METH_O, __Pyx_DOCSTR(0)}, + {0, 0, 0, 0} +}; + +#if PY_MAJOR_VERSION >= 3 +static struct PyModuleDef __pyx_moduledef = { + #if PY_VERSION_HEX < 0x03020000 + { PyObject_HEAD_INIT(NULL) NULL, 0, NULL }, + #else + PyModuleDef_HEAD_INIT, + #endif + __Pyx_NAMESTR("spacy"), + 0, /* m_doc */ + -1, /* m_size */ + __pyx_methods /* m_methods */, + NULL, /* m_reload */ + NULL, /* m_traverse */ + NULL, /* m_clear */ + NULL /* m_free */ +}; +#endif + +static __Pyx_StringTabEntry __pyx_string_tab[] = { + {&__pyx_n_s_main, __pyx_k_main, sizeof(__pyx_k_main), 0, 0, 1, 1}, + {&__pyx_n_s_range, __pyx_k_range, sizeof(__pyx_k_range), 0, 0, 1, 1}, + {&__pyx_n_s_test, __pyx_k_test, sizeof(__pyx_k_test), 0, 0, 1, 1}, + {0, 0, 0, 0, 0, 0, 0} +}; +static int __Pyx_InitCachedBuiltins(void) { + __pyx_builtin_range = __Pyx_GetBuiltinName(__pyx_n_s_range); if (!__pyx_builtin_range) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 64; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + return 0; + __pyx_L1_error:; + return -1; +} + +static int __Pyx_InitCachedConstants(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_InitCachedConstants", 0); + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_InitGlobals(void) { + if (__Pyx_InitStrings(__pyx_string_tab) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + return 0; + __pyx_L1_error:; + return -1; +} + +#if PY_MAJOR_VERSION < 3 +PyMODINIT_FUNC initspacy(void); /*proto*/ +PyMODINIT_FUNC initspacy(void) +#else +PyMODINIT_FUNC PyInit_spacy(void); /*proto*/ +PyMODINIT_FUNC PyInit_spacy(void) +#endif +{ + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannyDeclarations + #if CYTHON_REFNANNY + __Pyx_RefNanny = __Pyx_RefNannyImportAPI("refnanny"); + if (!__Pyx_RefNanny) { + PyErr_Clear(); + __Pyx_RefNanny = __Pyx_RefNannyImportAPI("Cython.Runtime.refnanny"); + if (!__Pyx_RefNanny) + Py_FatalError("failed to import 'refnanny' module"); + } + #endif + __Pyx_RefNannySetupContext("PyMODINIT_FUNC PyInit_spacy(void)", 0); + if ( __Pyx_check_binary_version() < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_empty_tuple = PyTuple_New(0); if (unlikely(!__pyx_empty_tuple)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_empty_bytes = PyBytes_FromStringAndSize("", 0); if (unlikely(!__pyx_empty_bytes)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + #ifdef __Pyx_CyFunction_USED + if (__Pyx_CyFunction_init() < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + #endif + #ifdef __Pyx_FusedFunction_USED + if (__pyx_FusedFunction_init() < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + #endif + #ifdef __Pyx_Generator_USED + if (__pyx_Generator_init() < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + #endif + /*--- Library function declarations ---*/ + /*--- Threads initialization code ---*/ + #if defined(__PYX_FORCE_INIT_THREADS) && __PYX_FORCE_INIT_THREADS + #ifdef WITH_THREAD /* Python build with threading support? */ + PyEval_InitThreads(); + #endif + #endif + /*--- Module creation code ---*/ + #if PY_MAJOR_VERSION < 3 + __pyx_m = Py_InitModule4(__Pyx_NAMESTR("spacy"), __pyx_methods, 0, 0, PYTHON_API_VERSION); Py_XINCREF(__pyx_m); + #else + __pyx_m = PyModule_Create(&__pyx_moduledef); + #endif + if (unlikely(!__pyx_m)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_d = PyModule_GetDict(__pyx_m); if (unlikely(!__pyx_d)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + Py_INCREF(__pyx_d); + __pyx_b = PyImport_AddModule(__Pyx_NAMESTR(__Pyx_BUILTIN_MODULE_NAME)); if (unlikely(!__pyx_b)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + #if CYTHON_COMPILING_IN_PYPY + Py_INCREF(__pyx_b); + #endif + if (__Pyx_SetAttrString(__pyx_m, "__builtins__", __pyx_b) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + /*--- Initialize various global constants etc. ---*/ + if (unlikely(__Pyx_InitGlobals() < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + #if PY_MAJOR_VERSION < 3 && (__PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT) + if (__Pyx_init_sys_getdefaultencoding_params() < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + #endif + if (__pyx_module_is_main_spacy__spacy) { + if (__Pyx_SetAttrString(__pyx_m, "__name__", __pyx_n_s_main) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + } + #if PY_MAJOR_VERSION >= 3 + { + PyObject *modules = PyImport_GetModuleDict(); if (unlikely(!modules)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (!PyDict_GetItemString(modules, "spacy.spacy")) { + if (unlikely(PyDict_SetItemString(modules, "spacy.spacy", __pyx_m) < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + } + } + #endif + /*--- Builtin init code ---*/ + if (unlikely(__Pyx_InitCachedBuiltins() < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + /*--- Constants init code ---*/ + if (unlikely(__Pyx_InitCachedConstants() < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + /*--- Global init code ---*/ + /*--- Variable export code ---*/ + /*--- Function export code ---*/ + if (__Pyx_ExportFunction("expand_chunk", (void (*)(void))__pyx_f_5spacy_5spacy_expand_chunk, "std::vector (size_t, int __pyx_skip_dispatch)") < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + /*--- Type init code ---*/ + /*--- Type import code ---*/ + /*--- Variable import code ---*/ + /*--- Function import code ---*/ + /*--- Execution code ---*/ + + /* "spacy/spacy.pyx":1 + * from __future__ import unicode_literals # <<<<<<<<<<<<<< + * from spacy.lexeme cimport Lexeme + * + */ + __pyx_t_1 = PyDict_New(); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_1) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "vector.to_py":63 + * + * @cname("__pyx_convert_vector_to_py_size_t") + * cdef object __pyx_convert_vector_to_py_size_t(vector[X]& v): # <<<<<<<<<<<<<< + * return [X_to_py(v[i]) for i in range(v.size())] + * + */ + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + if (__pyx_m) { + __Pyx_AddTraceback("init spacy.spacy", __pyx_clineno, __pyx_lineno, __pyx_filename); + Py_DECREF(__pyx_m); __pyx_m = 0; + } else if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_ImportError, "init spacy.spacy"); + } + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + #if PY_MAJOR_VERSION < 3 + return; + #else + return __pyx_m; + #endif +} + +/* Runtime support code */ +#if CYTHON_REFNANNY +static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname) { + PyObject *m = NULL, *p = NULL; + void *r = NULL; + m = PyImport_ImportModule((char *)modname); + if (!m) goto end; + p = PyObject_GetAttrString(m, (char *)"RefNannyAPI"); + if (!p) goto end; + r = PyLong_AsVoidPtr(p); +end: + Py_XDECREF(p); + Py_XDECREF(m); + return (__Pyx_RefNannyAPIStruct *)r; +} +#endif /* CYTHON_REFNANNY */ + +static PyObject *__Pyx_GetBuiltinName(PyObject *name) { + PyObject* result = __Pyx_PyObject_GetAttrStr(__pyx_b, name); + if (unlikely(!result)) { + PyErr_Format(PyExc_NameError, +#if PY_MAJOR_VERSION >= 3 + "name '%U' is not defined", name); +#else + "name '%.200s' is not defined", PyString_AS_STRING(name)); +#endif + } + return result; +} + +#define __PYX_VERIFY_RETURN_INT(target_type, func_type, func) \ + { \ + func_type value = func(x); \ + if (sizeof(target_type) < sizeof(func_type)) { \ + if (unlikely(value != (func_type) (target_type) value)) { \ + func_type zero = 0; \ + PyErr_SetString(PyExc_OverflowError, \ + (is_unsigned && unlikely(value < zero)) ? \ + "can't convert negative value to " #target_type : \ + "value too large to convert to " #target_type); \ + return (target_type) -1; \ + } \ + } \ + return (target_type) value; \ + } + +#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + #include "longintrepr.h" + #endif +#endif +static CYTHON_INLINE size_t __Pyx_PyInt_As_size_t(PyObject *x) { + const size_t neg_one = (size_t) -1, const_zero = 0; + const int is_unsigned = neg_one > const_zero; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_Check(x))) { + if (sizeof(size_t) < sizeof(long)) { + __PYX_VERIFY_RETURN_INT(size_t, long, PyInt_AS_LONG) + } else { + long val = PyInt_AS_LONG(x); + if (is_unsigned && unlikely(val < 0)) { + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to size_t"); + return (size_t) -1; + } + return (size_t) val; + } + } else +#endif + if (likely(PyLong_Check(x))) { + if (is_unsigned) { +#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + if (sizeof(digit) <= sizeof(size_t)) { + switch (Py_SIZE(x)) { + case 0: return 0; + case 1: return (size_t) ((PyLongObject*)x)->ob_digit[0]; + } + } + #endif +#endif + if (unlikely(Py_SIZE(x) < 0)) { + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to size_t"); + return (size_t) -1; + } + if (sizeof(size_t) <= sizeof(unsigned long)) { + __PYX_VERIFY_RETURN_INT(size_t, unsigned long, PyLong_AsUnsignedLong) + } else if (sizeof(size_t) <= sizeof(unsigned long long)) { + __PYX_VERIFY_RETURN_INT(size_t, unsigned long long, PyLong_AsUnsignedLongLong) + } + } else { +#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + if (sizeof(digit) <= sizeof(size_t)) { + switch (Py_SIZE(x)) { + case 0: return 0; + case 1: return +(size_t) ((PyLongObject*)x)->ob_digit[0]; + case -1: return -(size_t) ((PyLongObject*)x)->ob_digit[0]; + } + } + #endif +#endif + if (sizeof(size_t) <= sizeof(long)) { + __PYX_VERIFY_RETURN_INT(size_t, long, PyLong_AsLong) + } else if (sizeof(size_t) <= sizeof(long long)) { + __PYX_VERIFY_RETURN_INT(size_t, long long, PyLong_AsLongLong) + } + } + { +#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray) + PyErr_SetString(PyExc_RuntimeError, + "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers"); +#else + size_t val; + PyObject *v = __Pyx_PyNumber_Int(x); + #if PY_MAJOR_VERSION < 3 + if (likely(v) && !PyLong_Check(v)) { + PyObject *tmp = v; + v = PyNumber_Long(tmp); + Py_DECREF(tmp); + } + #endif + if (likely(v)) { + int one = 1; int is_little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&val; + int ret = _PyLong_AsByteArray((PyLongObject *)v, + bytes, sizeof(val), + is_little, !is_unsigned); + Py_DECREF(v); + if (likely(!ret)) + return val; + } +#endif + return (size_t) -1; + } + } else { + size_t val; + PyObject *tmp = __Pyx_PyNumber_Int(x); + if (!tmp) return (size_t) -1; + val = __Pyx_PyInt_As_size_t(tmp); + Py_DECREF(tmp); + return val; + } +} + +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value) { + const long neg_one = (long) -1, const_zero = 0; + const int is_unsigned = neg_one > const_zero; + if (is_unsigned) { + if (sizeof(long) < sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(long) <= sizeof(unsigned long)) { + return PyLong_FromUnsignedLong((unsigned long) value); + } else if (sizeof(long) <= sizeof(unsigned long long)) { + return PyLong_FromUnsignedLongLong((unsigned long long) value); + } + } else { + if (sizeof(long) <= sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(long) <= sizeof(long long)) { + return PyLong_FromLongLong((long long) value); + } + } + { + int one = 1; int little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&value; + return _PyLong_FromByteArray(bytes, sizeof(long), + little, !is_unsigned); + } +} + +#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + #include "longintrepr.h" + #endif +#endif +static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *x) { + const long neg_one = (long) -1, const_zero = 0; + const int is_unsigned = neg_one > const_zero; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_Check(x))) { + if (sizeof(long) < sizeof(long)) { + __PYX_VERIFY_RETURN_INT(long, long, PyInt_AS_LONG) + } else { + long val = PyInt_AS_LONG(x); + if (is_unsigned && unlikely(val < 0)) { + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to long"); + return (long) -1; + } + return (long) val; + } + } else +#endif + if (likely(PyLong_Check(x))) { + if (is_unsigned) { +#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + if (sizeof(digit) <= sizeof(long)) { + switch (Py_SIZE(x)) { + case 0: return 0; + case 1: return (long) ((PyLongObject*)x)->ob_digit[0]; + } + } + #endif +#endif + if (unlikely(Py_SIZE(x) < 0)) { + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to long"); + return (long) -1; + } + if (sizeof(long) <= sizeof(unsigned long)) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, PyLong_AsUnsignedLong) + } else if (sizeof(long) <= sizeof(unsigned long long)) { + __PYX_VERIFY_RETURN_INT(long, unsigned long long, PyLong_AsUnsignedLongLong) + } + } else { +#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + if (sizeof(digit) <= sizeof(long)) { + switch (Py_SIZE(x)) { + case 0: return 0; + case 1: return +(long) ((PyLongObject*)x)->ob_digit[0]; + case -1: return -(long) ((PyLongObject*)x)->ob_digit[0]; + } + } + #endif +#endif + if (sizeof(long) <= sizeof(long)) { + __PYX_VERIFY_RETURN_INT(long, long, PyLong_AsLong) + } else if (sizeof(long) <= sizeof(long long)) { + __PYX_VERIFY_RETURN_INT(long, long long, PyLong_AsLongLong) + } + } + { +#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray) + PyErr_SetString(PyExc_RuntimeError, + "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers"); +#else + long val; + PyObject *v = __Pyx_PyNumber_Int(x); + #if PY_MAJOR_VERSION < 3 + if (likely(v) && !PyLong_Check(v)) { + PyObject *tmp = v; + v = PyNumber_Long(tmp); + Py_DECREF(tmp); + } + #endif + if (likely(v)) { + int one = 1; int is_little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&val; + int ret = _PyLong_AsByteArray((PyLongObject *)v, + bytes, sizeof(val), + is_little, !is_unsigned); + Py_DECREF(v); + if (likely(!ret)) + return val; + } +#endif + return (long) -1; + } + } else { + long val; + PyObject *tmp = __Pyx_PyNumber_Int(x); + if (!tmp) return (long) -1; + val = __Pyx_PyInt_As_long(tmp); + Py_DECREF(tmp); + return val; + } +} + +#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + #include "longintrepr.h" + #endif +#endif +static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *x) { + const int neg_one = (int) -1, const_zero = 0; + const int is_unsigned = neg_one > const_zero; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_Check(x))) { + if (sizeof(int) < sizeof(long)) { + __PYX_VERIFY_RETURN_INT(int, long, PyInt_AS_LONG) + } else { + long val = PyInt_AS_LONG(x); + if (is_unsigned && unlikely(val < 0)) { + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to int"); + return (int) -1; + } + return (int) val; + } + } else +#endif + if (likely(PyLong_Check(x))) { + if (is_unsigned) { +#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + if (sizeof(digit) <= sizeof(int)) { + switch (Py_SIZE(x)) { + case 0: return 0; + case 1: return (int) ((PyLongObject*)x)->ob_digit[0]; + } + } + #endif +#endif + if (unlikely(Py_SIZE(x) < 0)) { + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to int"); + return (int) -1; + } + if (sizeof(int) <= sizeof(unsigned long)) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, PyLong_AsUnsignedLong) + } else if (sizeof(int) <= sizeof(unsigned long long)) { + __PYX_VERIFY_RETURN_INT(int, unsigned long long, PyLong_AsUnsignedLongLong) + } + } else { +#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + if (sizeof(digit) <= sizeof(int)) { + switch (Py_SIZE(x)) { + case 0: return 0; + case 1: return +(int) ((PyLongObject*)x)->ob_digit[0]; + case -1: return -(int) ((PyLongObject*)x)->ob_digit[0]; + } + } + #endif +#endif + if (sizeof(int) <= sizeof(long)) { + __PYX_VERIFY_RETURN_INT(int, long, PyLong_AsLong) + } else if (sizeof(int) <= sizeof(long long)) { + __PYX_VERIFY_RETURN_INT(int, long long, PyLong_AsLongLong) + } + } + { +#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray) + PyErr_SetString(PyExc_RuntimeError, + "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers"); +#else + int val; + PyObject *v = __Pyx_PyNumber_Int(x); + #if PY_MAJOR_VERSION < 3 + if (likely(v) && !PyLong_Check(v)) { + PyObject *tmp = v; + v = PyNumber_Long(tmp); + Py_DECREF(tmp); + } + #endif + if (likely(v)) { + int one = 1; int is_little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&val; + int ret = _PyLong_AsByteArray((PyLongObject *)v, + bytes, sizeof(val), + is_little, !is_unsigned); + Py_DECREF(v); + if (likely(!ret)) + return val; + } +#endif + return (int) -1; + } + } else { + int val; + PyObject *tmp = __Pyx_PyNumber_Int(x); + if (!tmp) return (int) -1; + val = __Pyx_PyInt_As_int(tmp); + Py_DECREF(tmp); + return val; + } +} + +static int __Pyx_check_binary_version(void) { + char ctversion[4], rtversion[4]; + PyOS_snprintf(ctversion, 4, "%d.%d", PY_MAJOR_VERSION, PY_MINOR_VERSION); + PyOS_snprintf(rtversion, 4, "%s", Py_GetVersion()); + if (ctversion[0] != rtversion[0] || ctversion[2] != rtversion[2]) { + char message[200]; + PyOS_snprintf(message, sizeof(message), + "compiletime version %s of module '%.100s' " + "does not match runtime version %s", + ctversion, __Pyx_MODULE_NAME, rtversion); + #if PY_VERSION_HEX < 0x02050000 + return PyErr_Warn(NULL, message); + #else + return PyErr_WarnEx(NULL, message, 1); + #endif + } + return 0; +} + +static int __Pyx_ExportFunction(const char *name, void (*f)(void), const char *sig) { + PyObject *d = 0; + PyObject *cobj = 0; + union { + void (*fp)(void); + void *p; + } tmp; + d = PyObject_GetAttrString(__pyx_m, (char *)"__pyx_capi__"); + if (!d) { + PyErr_Clear(); + d = PyDict_New(); + if (!d) + goto bad; + Py_INCREF(d); + if (PyModule_AddObject(__pyx_m, (char *)"__pyx_capi__", d) < 0) + goto bad; + } + tmp.fp = f; +#if PY_VERSION_HEX >= 0x02070000 && !(PY_MAJOR_VERSION==3&&PY_MINOR_VERSION==0) + cobj = PyCapsule_New(tmp.p, sig, 0); +#else + cobj = PyCObject_FromVoidPtrAndDesc(tmp.p, (void *)sig, 0); +#endif + if (!cobj) + goto bad; + if (PyDict_SetItemString(d, name, cobj) < 0) + goto bad; + Py_DECREF(cobj); + Py_DECREF(d); + return 0; +bad: + Py_XDECREF(cobj); + Py_XDECREF(d); + return -1; +} + +static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line) { + int start = 0, mid = 0, end = count - 1; + if (end >= 0 && code_line > entries[end].code_line) { + return count; + } + while (start < end) { + mid = (start + end) / 2; + if (code_line < entries[mid].code_line) { + end = mid; + } else if (code_line > entries[mid].code_line) { + start = mid + 1; + } else { + return mid; + } + } + if (code_line <= entries[mid].code_line) { + return mid; + } else { + return mid + 1; + } +} +static PyCodeObject *__pyx_find_code_object(int code_line) { + PyCodeObject* code_object; + int pos; + if (unlikely(!code_line) || unlikely(!__pyx_code_cache.entries)) { + return NULL; + } + pos = __pyx_bisect_code_objects(__pyx_code_cache.entries, __pyx_code_cache.count, code_line); + if (unlikely(pos >= __pyx_code_cache.count) || unlikely(__pyx_code_cache.entries[pos].code_line != code_line)) { + return NULL; + } + code_object = __pyx_code_cache.entries[pos].code_object; + Py_INCREF(code_object); + return code_object; +} +static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object) { + int pos, i; + __Pyx_CodeObjectCacheEntry* entries = __pyx_code_cache.entries; + if (unlikely(!code_line)) { + return; + } + if (unlikely(!entries)) { + entries = (__Pyx_CodeObjectCacheEntry*)PyMem_Malloc(64*sizeof(__Pyx_CodeObjectCacheEntry)); + if (likely(entries)) { + __pyx_code_cache.entries = entries; + __pyx_code_cache.max_count = 64; + __pyx_code_cache.count = 1; + entries[0].code_line = code_line; + entries[0].code_object = code_object; + Py_INCREF(code_object); + } + return; + } + pos = __pyx_bisect_code_objects(__pyx_code_cache.entries, __pyx_code_cache.count, code_line); + if ((pos < __pyx_code_cache.count) && unlikely(__pyx_code_cache.entries[pos].code_line == code_line)) { + PyCodeObject* tmp = entries[pos].code_object; + entries[pos].code_object = code_object; + Py_DECREF(tmp); + return; + } + if (__pyx_code_cache.count == __pyx_code_cache.max_count) { + int new_max = __pyx_code_cache.max_count + 64; + entries = (__Pyx_CodeObjectCacheEntry*)PyMem_Realloc( + __pyx_code_cache.entries, new_max*sizeof(__Pyx_CodeObjectCacheEntry)); + if (unlikely(!entries)) { + return; + } + __pyx_code_cache.entries = entries; + __pyx_code_cache.max_count = new_max; + } + for (i=__pyx_code_cache.count; i>pos; i--) { + entries[i] = entries[i-1]; + } + entries[pos].code_line = code_line; + entries[pos].code_object = code_object; + __pyx_code_cache.count++; + Py_INCREF(code_object); +} + +#include "compile.h" +#include "frameobject.h" +#include "traceback.h" +static PyCodeObject* __Pyx_CreateCodeObjectForTraceback( + const char *funcname, int c_line, + int py_line, const char *filename) { + PyCodeObject *py_code = 0; + PyObject *py_srcfile = 0; + PyObject *py_funcname = 0; + #if PY_MAJOR_VERSION < 3 + py_srcfile = PyString_FromString(filename); + #else + py_srcfile = PyUnicode_FromString(filename); + #endif + if (!py_srcfile) goto bad; + if (c_line) { + #if PY_MAJOR_VERSION < 3 + py_funcname = PyString_FromFormat( "%s (%s:%d)", funcname, __pyx_cfilenm, c_line); + #else + py_funcname = PyUnicode_FromFormat( "%s (%s:%d)", funcname, __pyx_cfilenm, c_line); + #endif + } + else { + #if PY_MAJOR_VERSION < 3 + py_funcname = PyString_FromString(funcname); + #else + py_funcname = PyUnicode_FromString(funcname); + #endif + } + if (!py_funcname) goto bad; + py_code = __Pyx_PyCode_New( + 0, /*int argcount,*/ + 0, /*int kwonlyargcount,*/ + 0, /*int nlocals,*/ + 0, /*int stacksize,*/ + 0, /*int flags,*/ + __pyx_empty_bytes, /*PyObject *code,*/ + __pyx_empty_tuple, /*PyObject *consts,*/ + __pyx_empty_tuple, /*PyObject *names,*/ + __pyx_empty_tuple, /*PyObject *varnames,*/ + __pyx_empty_tuple, /*PyObject *freevars,*/ + __pyx_empty_tuple, /*PyObject *cellvars,*/ + py_srcfile, /*PyObject *filename,*/ + py_funcname, /*PyObject *name,*/ + py_line, /*int firstlineno,*/ + __pyx_empty_bytes /*PyObject *lnotab*/ + ); + Py_DECREF(py_srcfile); + Py_DECREF(py_funcname); + return py_code; +bad: + Py_XDECREF(py_srcfile); + Py_XDECREF(py_funcname); + return NULL; +} +static void __Pyx_AddTraceback(const char *funcname, int c_line, + int py_line, const char *filename) { + PyCodeObject *py_code = 0; + PyObject *py_globals = 0; + PyFrameObject *py_frame = 0; + py_code = __pyx_find_code_object(c_line ? c_line : py_line); + if (!py_code) { + py_code = __Pyx_CreateCodeObjectForTraceback( + funcname, c_line, py_line, filename); + if (!py_code) goto bad; + __pyx_insert_code_object(c_line ? c_line : py_line, py_code); + } + py_globals = PyModule_GetDict(__pyx_m); + if (!py_globals) goto bad; + py_frame = PyFrame_New( + PyThreadState_GET(), /*PyThreadState *tstate,*/ + py_code, /*PyCodeObject *code,*/ + py_globals, /*PyObject *globals,*/ + 0 /*PyObject *locals*/ + ); + if (!py_frame) goto bad; + py_frame->f_lineno = py_line; + PyTraceBack_Here(py_frame); +bad: + Py_XDECREF(py_code); + Py_XDECREF(py_frame); +} + +static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) { + while (t->p) { + #if PY_MAJOR_VERSION < 3 + if (t->is_unicode) { + *t->p = PyUnicode_DecodeUTF8(t->s, t->n - 1, NULL); + } else if (t->intern) { + *t->p = PyString_InternFromString(t->s); + } else { + *t->p = PyString_FromStringAndSize(t->s, t->n - 1); + } + #else /* Python 3+ has unicode identifiers */ + if (t->is_unicode | t->is_str) { + if (t->intern) { + *t->p = PyUnicode_InternFromString(t->s); + } else if (t->encoding) { + *t->p = PyUnicode_Decode(t->s, t->n - 1, t->encoding, NULL); + } else { + *t->p = PyUnicode_FromStringAndSize(t->s, t->n - 1); + } + } else { + *t->p = PyBytes_FromStringAndSize(t->s, t->n - 1); + } + #endif + if (!*t->p) + return -1; + ++t; + } + return 0; +} + +static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(char* c_str) { + return __Pyx_PyUnicode_FromStringAndSize(c_str, strlen(c_str)); +} +static CYTHON_INLINE char* __Pyx_PyObject_AsString(PyObject* o) { + Py_ssize_t ignore; + return __Pyx_PyObject_AsStringAndSize(o, &ignore); +} +static CYTHON_INLINE char* __Pyx_PyObject_AsStringAndSize(PyObject* o, Py_ssize_t *length) { +#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT + if ( +#if PY_MAJOR_VERSION < 3 && __PYX_DEFAULT_STRING_ENCODING_IS_ASCII + __Pyx_sys_getdefaultencoding_not_ascii && +#endif + PyUnicode_Check(o)) { +#if PY_VERSION_HEX < 0x03030000 + char* defenc_c; + PyObject* defenc = _PyUnicode_AsDefaultEncodedString(o, NULL); + if (!defenc) return NULL; + defenc_c = PyBytes_AS_STRING(defenc); +#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII + { + char* end = defenc_c + PyBytes_GET_SIZE(defenc); + char* c; + for (c = defenc_c; c < end; c++) { + if ((unsigned char) (*c) >= 128) { + PyUnicode_AsASCIIString(o); + return NULL; + } + } + } +#endif /*__PYX_DEFAULT_STRING_ENCODING_IS_ASCII*/ + *length = PyBytes_GET_SIZE(defenc); + return defenc_c; +#else /* PY_VERSION_HEX < 0x03030000 */ + if (PyUnicode_READY(o) == -1) return NULL; +#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII + if (PyUnicode_IS_ASCII(o)) { + *length = PyUnicode_GET_DATA_SIZE(o); + return PyUnicode_AsUTF8(o); + } else { + PyUnicode_AsASCIIString(o); + return NULL; + } +#else /* __PYX_DEFAULT_STRING_ENCODING_IS_ASCII */ + return PyUnicode_AsUTF8AndSize(o, length); +#endif /* __PYX_DEFAULT_STRING_ENCODING_IS_ASCII */ +#endif /* PY_VERSION_HEX < 0x03030000 */ + } else +#endif /* __PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT */ +#if !CYTHON_COMPILING_IN_PYPY +#if PY_VERSION_HEX >= 0x02060000 + if (PyByteArray_Check(o)) { + *length = PyByteArray_GET_SIZE(o); + return PyByteArray_AS_STRING(o); + } else +#endif +#endif + { + char* result; + int r = PyBytes_AsStringAndSize(o, &result, length); + if (unlikely(r < 0)) { + return NULL; + } else { + return result; + } + } +} +static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject* x) { + int is_true = x == Py_True; + if (is_true | (x == Py_False) | (x == Py_None)) return is_true; + else return PyObject_IsTrue(x); +} +static CYTHON_INLINE PyObject* __Pyx_PyNumber_Int(PyObject* x) { + PyNumberMethods *m; + const char *name = NULL; + PyObject *res = NULL; +#if PY_MAJOR_VERSION < 3 + if (PyInt_Check(x) || PyLong_Check(x)) +#else + if (PyLong_Check(x)) +#endif + return Py_INCREF(x), x; + m = Py_TYPE(x)->tp_as_number; +#if PY_MAJOR_VERSION < 3 + if (m && m->nb_int) { + name = "int"; + res = PyNumber_Int(x); + } + else if (m && m->nb_long) { + name = "long"; + res = PyNumber_Long(x); + } +#else + if (m && m->nb_int) { + name = "int"; + res = PyNumber_Long(x); + } +#endif + if (res) { +#if PY_MAJOR_VERSION < 3 + if (!PyInt_Check(res) && !PyLong_Check(res)) { +#else + if (!PyLong_Check(res)) { +#endif + PyErr_Format(PyExc_TypeError, + "__%.4s__ returned non-%.4s (type %.200s)", + name, name, Py_TYPE(res)->tp_name); + Py_DECREF(res); + return NULL; + } + } + else if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_TypeError, + "an integer is required"); + } + return res; +} +#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + #include "longintrepr.h" + #endif +#endif +static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject* b) { + Py_ssize_t ival; + PyObject *x; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_CheckExact(b))) + return PyInt_AS_LONG(b); +#endif + if (likely(PyLong_CheckExact(b))) { + #if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 + #if CYTHON_USE_PYLONG_INTERNALS + switch (Py_SIZE(b)) { + case -1: return -(sdigit)((PyLongObject*)b)->ob_digit[0]; + case 0: return 0; + case 1: return ((PyLongObject*)b)->ob_digit[0]; + } + #endif + #endif + #if PY_VERSION_HEX < 0x02060000 + return PyInt_AsSsize_t(b); + #else + return PyLong_AsSsize_t(b); + #endif + } + x = PyNumber_Index(b); + if (!x) return -1; + ival = PyInt_AsSsize_t(x); + Py_DECREF(x); + return ival; +} +static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t ival) { +#if PY_VERSION_HEX < 0x02050000 + if (ival <= LONG_MAX) + return PyInt_FromLong((long)ival); + else { + unsigned char *bytes = (unsigned char *) &ival; + int one = 1; int little = (int)*(unsigned char*)&one; + return _PyLong_FromByteArray(bytes, sizeof(size_t), little, 0); + } +#else + return PyInt_FromSize_t(ival); +#endif +} + + +#endif /* Py_PYTHON_H */ diff --git a/spacy/spacy.pxd b/spacy/spacy.pxd new file mode 100644 index 000000000..8e93050dd --- /dev/null +++ b/spacy/spacy.pxd @@ -0,0 +1,5 @@ +from libcpp.vector cimport vector +from spacy.lexeme cimport Lexeme + + +cpdef vector[size_t] expand_chunk(size_t addr) except * diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx new file mode 100644 index 000000000..43b376767 --- /dev/null +++ b/spacy/spacy.pyx @@ -0,0 +1,72 @@ +from __future__ import unicode_literals +from spacy.lexeme cimport Lexeme + + +cpdef vector[size_t] expand_chunk(size_t addr) except *: + cdef vector[size_t] tokens = vector[size_t]() + word = addr + while word is not NULL: + tokens.push_back(word) + word = word.tail + return tokens + + +""" +cpdef vector[size_t] ids_from_text(unicode text) except *: + cdef size_t length = len(text) + cdef Py_UNICODE* characters = text + + cdef size_t i + cdef Py_UNICODE c + + cdef vector[size_t] tokens = vector[size_t]() + cdef unicode current = u'' + cdef Lexeme* token + cdef int alnum_end = -1 + cdef size_t alnum_start = 0 + cdef bint seen_alnum = False + for i in range(length): + c = characters[i] + if is_whitespace(c): + token = lookup(current) + tokens.push_back(token) + clitic = 0 + while token.clitics[clitic]: + tokens.push_back(token.clitics[clitic]) + clitic += 1 + current = u'' + alnum_start = 0 + alnum_end = -1 + seen_alnum = False + else: + if not seen_alnum and c.isalnum(): + alnum_start = i + seen_alnum = True + elif seen_alnum and alnum_end == -1 and not c.isalnum(): + alnum_end = i + current += c + if current: + token = lookup(current) + tokens.push_back(token) + clitic = 0 + while token.clitics[clitic]: + tokens.push_back(token.clitics[clitic]) + clitic += 1 + return tokens +""" + +#cdef vector[Tokens] group_by(Tokens tokens, LexAttr field) except *: +# pass + + +cdef inline bint is_whitespace(Py_UNICODE c): + # TODO: Support other unicode spaces + # https://www.cs.tut.fi/~jkorpela/chars/spaces.html + if c == u' ': + return True + elif c == u'\n': + return True + elif c == u'\t': + return True + else: + return False diff --git a/spacy/util.py b/spacy/util.py new file mode 100644 index 000000000..7cb50b82b --- /dev/null +++ b/spacy/util.py @@ -0,0 +1,75 @@ +def utf8open(loc, mode='r'): + return codecs.open(loc, mode, 'utf8') + + +def load_case_stats(data_dir): + case_loc = path.join(data_dir, 'english.case') + case_stats = {} + with utf8open(case_loc) as cases_file: + for line in cases_file: + word, upper, title = line.split() + case_stats[word] = (float(upper), float(title)) + return case_stats + + +def load_clitics(data_dir): + clitics_loc = path.join(data_dir, 'clitics.txt') + entries = [] + seen = set() + with utf8open(clitics_loc) as clitics_file: + for line in clitics_file: + line = line.strip() + if line.startswith('#'): + continue + if not line: + continue + clitics = line.split() + word = clitics.pop(0) + norm_form = clitics.pop(0) + assert word not in seen, word + seen.add(word) + entries.append((word, norm_form, clitics)) + return entries + + +""" + def load_browns(self, data_dir): + cdef Lexeme* w + case_stats = load_case_stats(data_dir) + brown_loc = path.join(data_dir, 'bllip-clusters') + assert path.exists(brown_loc) + cdef size_t start + cdef int end + with utf8open(brown_loc) as browns_file: + for i, line in enumerate(browns_file): + cluster_str, word, freq_str = line.split() + # Decode as a little-endian string, so that we can do & 15 to get + # the first 4 bits. See redshift._parse_features.pyx + cluster = int(cluster_str[::-1], 2) + upper_pc, title_pc = case_stats.get(word.lower(), (0.0, 0.0)) + start = 0 + end = -1 + find_slice(&start, &end, word) + print "Load", repr(word), start, end + w = init_word(word, start, end, cluster, + upper_pc, title_pc, int(freq_str)) + self.words[_hash_str(word)] = w + self.strings[w] = word + + def load_clitics(self, data_dir): + cdef unicode orig_str + cdef unicode clitic + for orig_str, norm_form, clitic_strs in util.load_clitics(data_dir): + w = init_clitic(orig_str, self.lookup_slice(norm_form, 0, -1)) + self.words[w.orig] = w + self.strings[w] = orig_str + assert len(clitic_strs) < MAX_CLITICS + assert clitic_strs + for i, clitic in enumerate(clitic_strs): + # If we write punctuation here, assume we want to keep it, + # so tell it the slice boundaries (the full string) + w.clitics[i] = self.lookup_slice(clitic, 0, -1) + # Ensure we null terminate + w.clitics[i+1] = 0 +""" + diff --git a/tests/.test_tokenizer.py.swo b/tests/.test_tokenizer.py.swo new file mode 100644 index 000000000..7b7be95fa Binary files /dev/null and b/tests/.test_tokenizer.py.swo differ diff --git a/tests/my_test.py b/tests/my_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_post_punct.py b/tests/test_post_punct.py new file mode 100644 index 000000000..af87cf919 --- /dev/null +++ b/tests/test_post_punct.py @@ -0,0 +1,48 @@ +from __future__ import unicode_literals + +from spacy import lex_of +from spacy.spacy import expand_chunk +from spacy.en import lookup +from spacy.en import unhash + +import pytest + + +@pytest.fixture +def close_puncts(): + return [')', ']', '}', '*'] + + +def test_close(close_puncts): + word_str = 'Hello' + for p in close_puncts: + string = word_str + p + token = lookup(string) + assert unhash(lex_of(token)) == word_str + tokens = expand_chunk(token) + assert len(tokens) == 2 + assert unhash(lex_of(tokens[0])) == word_str + assert unhash(lex_of(tokens[1])) == p + + +def test_two_different_close(close_puncts): + word_str = 'Hello' + for p in close_puncts: + string = word_str + p + "'" + token = lookup(string) + assert unhash(lex_of(token)) == word_str + tokens = expand_chunk(token) + assert len(tokens) == 3 + assert unhash(lex_of(tokens[0])) == word_str + assert unhash(lex_of(tokens[1])) == p + assert unhash(lex_of(tokens[2])) == "'" + + +def test_three_same_close(close_puncts): + word_str = 'Hello' + for p in close_puncts: + string = word_str + p + p + p + tokens = expand_chunk(lookup(string)) + assert len(tokens) == 2 + assert unhash(lex_of(tokens[0])) == word_str + assert unhash(lex_of(tokens[1])) == p + p + p diff --git a/tests/test_pre_punct.py b/tests/test_pre_punct.py new file mode 100644 index 000000000..a085ea6c9 --- /dev/null +++ b/tests/test_pre_punct.py @@ -0,0 +1,50 @@ +from __future__ import unicode_literals + +from spacy import lex_of +from spacy.spacy import expand_chunk +from spacy.en import lookup +from spacy.en import unhash + +import pytest + + +@pytest.fixture +def open_puncts(): + return ['(', '[', '{', '*'] + + +def test_open(open_puncts): + word_str = 'Hello' + for p in open_puncts: + string = p + word_str + token = lookup(string) + assert unhash(lex_of(token)) == p + tokens = expand_chunk(token) + assert len(tokens) == 2 + assert unhash(lex_of(tokens[0])) == p + assert unhash(lex_of(tokens[1])) == word_str + + +def test_two_different_open(open_puncts): + word_str = 'Hello' + for p in open_puncts: + string = p + "`" + word_str + token = lookup(string) + assert unhash(lex_of(token)) == p + tokens = expand_chunk(token) + assert len(tokens) == 3 + assert unhash(lex_of(tokens[0])) == p + assert unhash(lex_of(tokens[1])) == "`" + assert unhash(lex_of(tokens[2])) == word_str + + +def test_three_same_open(open_puncts): + word_str = 'Hello' + for p in open_puncts: + string = p + p + p + word_str + token = lookup(string) + assert unhash(lex_of(token)) == p + p + p + tokens = expand_chunk(token) + assert len(tokens) == 2 + assert unhash(lex_of(tokens[0])) == p + p + p + assert unhash(lex_of(tokens[1])) == word_str diff --git a/tests/test_surround_punct.py b/tests/test_surround_punct.py new file mode 100644 index 000000000..bef9cc83a --- /dev/null +++ b/tests/test_surround_punct.py @@ -0,0 +1,39 @@ +from __future__ import unicode_literals + +from spacy import lex_of, sic_of +from spacy.spacy import expand_chunk +from spacy.en import lookup +from spacy.en import unhash + +import pytest + + +@pytest.fixture +def paired_puncts(): + return [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')] + + +def test_token(paired_puncts): + word_str = 'Hello' + for open_, close_ in paired_puncts: + string = open_ + word_str + close_ + tokens = expand_chunk(lookup(string)) + assert len(tokens) == 3 + assert unhash(lex_of(tokens[0])) == open_ + assert unhash(lex_of(tokens[1])) == word_str + assert unhash(lex_of(tokens[2])) == close_ + assert unhash(sic_of(tokens[0])) == string + + +def test_two_different(paired_puncts): + word_str = 'Hello' + for open_, close_ in paired_puncts: + string = "`" + open_ + word_str + close_ + "'" + tokens = expand_chunk(lookup(string)) + assert len(tokens) == 5 + assert unhash(lex_of(tokens[0])) == "`" + assert unhash(lex_of(tokens[1])) == open_ + assert unhash(lex_of(tokens[2])) == word_str + assert unhash(lex_of(tokens[2])) == word_str + assert unhash(lex_of(tokens[3])) == close_ + assert unhash(lex_of(tokens[4])) == "'" diff --git a/tests/test_vocab.py b/tests/test_vocab.py new file mode 100644 index 000000000..9987a561b --- /dev/null +++ b/tests/test_vocab.py @@ -0,0 +1,30 @@ +from __future__ import unicode_literals + +from spacy import lex_of +from spacy.en import lookup +from spacy.en import unhash + + +def test_neq(): + addr = lookup('Hello') + assert lookup('bye') != addr + + +def test_eq(): + addr = lookup('Hello') + assert lookup('Hello') == addr + + +def test_round_trip(): + hello = lookup('Hello') + assert unhash(lex_of(hello)) == 'Hello' + + +def test_case_neq(): + addr = lookup('Hello') + assert lookup('hello') != addr + + +def test_punct_neq(): + addr = lookup('Hello') + assert lookup('Hello,') != addr