From a62c38e1efbdf83f46bf58120d037bd3f70fb3e4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 7 Jul 2014 01:15:59 +0200 Subject: [PATCH] * Working tokenization. en doesn't match PTB perfectly. Need to reorganize before adding more schemes. --- spacy/en.cpp | 1421 +++++++++++++++++++++++++--------- spacy/en.pyx | 59 +- spacy/lexeme.cpp | 2 +- spacy/spacy.cpp | 109 +-- spacy/spacy.pyx | 59 -- tests/.test_tokenizer.py.swo | Bin 12288 -> 12288 bytes tests/sun.tokens | 4 + tests/sun.txt | 4 + tests/tokenizer.sed | 82 ++ 9 files changed, 1199 insertions(+), 541 deletions(-) create mode 100644 tests/sun.tokens create mode 100644 tests/sun.txt create mode 100644 tests/tokenizer.sed diff --git a/spacy/en.cpp b/spacy/en.cpp index 3eadb456e..6422961a7 100644 --- a/spacy/en.cpp +++ b/spacy/en.cpp @@ -1,4 +1,4 @@ -/* Generated by Cython 0.20.1 on Mon Jul 7 00:02:26 2014 */ +/* Generated by Cython 0.20.1 on Mon Jul 7 01:14:44 2014 */ #define PY_SSIZE_T_CLEAN #ifndef CYTHON_USE_PYLONG_INTERNALS @@ -538,6 +538,7 @@ static const char *__pyx_filename; static const char *__pyx_f[] = { "en.pyx", + "stringsource", }; /* "spacy/lexeme.pxd":4 @@ -709,15 +710,15 @@ static CYTHON_INLINE int __Pyx_IterFinish(void); /*proto*/ static int __Pyx_IternextUnpackEndCheck(PyObject *retval, Py_ssize_t expected); /*proto*/ +static CYTHON_INLINE int __Pyx_ArgTypeTest(PyObject *obj, PyTypeObject *type, int none_allowed, + const char *name, int exact); /*proto*/ + #include static CYTHON_INLINE int __Pyx_PyBytes_Equals(PyObject* s1, PyObject* s2, int equals); /*proto*/ static CYTHON_INLINE int __Pyx_PyUnicode_Equals(PyObject* s1, PyObject* s2, int equals); /*proto*/ -static CYTHON_INLINE int __Pyx_ArgTypeTest(PyObject *obj, PyTypeObject *type, int none_allowed, - const char *name, int exact); /*proto*/ - static void __Pyx_RaiseArgtupleInvalid(const char* func_name, int exact, Py_ssize_t num_min, Py_ssize_t num_max, Py_ssize_t num_found); /*proto*/ @@ -808,6 +809,22 @@ static void __Pyx_WriteUnraisable(const char *name, int clineno, int lineno, const char *filename, int full_traceback); /*proto*/ +#if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE int __Pyx_ListComp_Append(PyObject* list, PyObject* x) { + PyListObject* L = (PyListObject*) list; + Py_ssize_t len = Py_SIZE(list); + if (likely(L->allocated > len)) { + Py_INCREF(x); + PyList_SET_ITEM(list, len, x); + Py_SIZE(list) = len+1; + return 0; + } + return PyList_Append(list, x); +} +#else +#define __Pyx_ListComp_Append(L,x) PyList_Append(L,x) +#endif + static PyObject* __Pyx_ImportFrom(PyObject* module, PyObject* name); /*proto*/ static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level); /*proto*/ @@ -818,6 +835,45 @@ static CYTHON_INLINE uint64_t __Pyx_PyInt_As_uint64_t(PyObject *); static CYTHON_INLINE size_t __Pyx_PyInt_As_size_t(PyObject *); +#ifndef __Pyx_CppExn2PyErr +#include +#include +#include +#include +static void __Pyx_CppExn2PyErr() { + try { + if (PyErr_Occurred()) + ; // let the latest Python exn pass through and ignore the current one + else + throw; + } catch (const std::bad_alloc& exn) { + PyErr_SetString(PyExc_MemoryError, exn.what()); + } catch (const std::bad_cast& exn) { + PyErr_SetString(PyExc_TypeError, exn.what()); + } catch (const std::domain_error& exn) { + PyErr_SetString(PyExc_ValueError, exn.what()); + } catch (const std::invalid_argument& exn) { + PyErr_SetString(PyExc_ValueError, exn.what()); + } catch (const std::ios_base::failure& exn) { + PyErr_SetString(PyExc_IOError, exn.what()); + } catch (const std::out_of_range& exn) { + PyErr_SetString(PyExc_IndexError, exn.what()); + } catch (const std::overflow_error& exn) { + PyErr_SetString(PyExc_OverflowError, exn.what()); + } catch (const std::range_error& exn) { + PyErr_SetString(PyExc_ArithmeticError, exn.what()); + } catch (const std::underflow_error& exn) { + PyErr_SetString(PyExc_ArithmeticError, exn.what()); + } catch (const std::exception& exn) { + PyErr_SetString(PyExc_RuntimeError, exn.what()); + } + catch (...) + { + PyErr_SetString(PyExc_RuntimeError, "Unknown exception"); + } +} +#endif + static CYTHON_INLINE PyObject* __Pyx_PyInt_From_uint64_t(uint64_t value); static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value); @@ -890,23 +946,28 @@ static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup(PyObject *, int static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup_chunk(PyObject *, int, int, int __pyx_skip_dispatch); /*proto*/ static __pyx_t_5spacy_6lexeme_StringHash __pyx_f_5spacy_2en_hash_string(PyObject *, size_t); /*proto*/ static PyObject *__pyx_f_5spacy_2en_unhash(__pyx_t_5spacy_6lexeme_StringHash, int __pyx_skip_dispatch); /*proto*/ +static std::vector<__pyx_t_5spacy_2en_Lexeme_addr> __pyx_f_5spacy_2en_tokenize(PyObject *, int __pyx_skip_dispatch); /*proto*/ +static CYTHON_INLINE int __pyx_f_5spacy_2en_is_whitespace(Py_UNICODE); /*proto*/ static PyObject *__pyx_f_5spacy_2en_normalize_word_string(PyObject *); /*proto*/ static PyObject *__pyx_f_5spacy_2en__substr(PyObject *, int, int, size_t, int __pyx_skip_dispatch); /*proto*/ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__add(__pyx_t_5spacy_6lexeme_StringHash, PyObject *, int, size_t); /*proto*/ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyObject *, __pyx_t_5spacy_6lexeme_StringHash, int, size_t); /*proto*/ static size_t __pyx_f_5spacy_2en__find_split(PyObject *, size_t); /*proto*/ static int __pyx_f_5spacy_2en_is_punct(PyObject *, size_t, size_t); /*proto*/ +static PyObject *__pyx_convert_vector_to_py___pyx_t_5spacy_2en_Lexeme_addr(const std::vector<__pyx_t_5spacy_2en_Lexeme_addr> &); /*proto*/ #define __Pyx_MODULE_NAME "spacy.en" int __pyx_module_is_main_spacy__en = 0; /* Implementation of 'spacy.en' */ static PyObject *__pyx_builtin_enumerate; +static PyObject *__pyx_builtin_range; static PyObject *__pyx_builtin_ValueError; static PyObject *__pyx_pf_5spacy_2en_load_tokenization(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_token_rules); /* proto */ -static PyObject *__pyx_pf_5spacy_2en_2lookup(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_string); /* proto */ -static PyObject *__pyx_pf_5spacy_2en_4lookup_chunk(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_string, int __pyx_v_start, int __pyx_v_end); /* proto */ -static PyObject *__pyx_pf_5spacy_2en_6unhash(CYTHON_UNUSED PyObject *__pyx_self, __pyx_t_5spacy_6lexeme_StringHash __pyx_v_hash_value); /* proto */ -static PyObject *__pyx_pf_5spacy_2en_8_substr(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_string, int __pyx_v_start, int __pyx_v_end, size_t __pyx_v_length); /* proto */ +static PyObject *__pyx_pf_5spacy_2en_2tokenize(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_string); /* proto */ +static PyObject *__pyx_pf_5spacy_2en_4lookup(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_string); /* proto */ +static PyObject *__pyx_pf_5spacy_2en_6lookup_chunk(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_string, int __pyx_v_start, int __pyx_v_end); /* proto */ +static PyObject *__pyx_pf_5spacy_2en_8unhash(CYTHON_UNUSED PyObject *__pyx_self, __pyx_t_5spacy_6lexeme_StringHash __pyx_v_hash_value); /* proto */ +static PyObject *__pyx_pf_5spacy_2en_10_substr(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_string, int __pyx_v_start, int __pyx_v_end, size_t __pyx_v_length); /* proto */ static char __pyx_k_[] = ""; static char __pyx_k_i[] = "i"; static char __pyx_k_s[] = "'s"; @@ -925,6 +986,7 @@ static char __pyx_k_chunk[] = "chunk"; static char __pyx_k_first[] = "first"; static char __pyx_k_last3[] = "last3"; static char __pyx_k_lower[] = "lower"; +static char __pyx_k_range[] = "range"; static char __pyx_k_s_d_s[] = "%s:@:%d:@:%s"; static char __pyx_k_start[] = "start"; static char __pyx_k_DIGITS[] = "!DIGITS"; @@ -979,6 +1041,7 @@ static PyObject *__pyx_n_s_oft_title; static PyObject *__pyx_n_s_oft_upper; static PyObject *__pyx_n_s_prob; static PyObject *__pyx_n_s_pyx_capi; +static PyObject *__pyx_n_s_range; static PyObject *__pyx_n_s_read_tokenization; static PyObject *__pyx_kp_u_s; static PyObject *__pyx_kp_u_s_d_s; @@ -999,7 +1062,7 @@ static PyObject *__pyx_tuple__2; static PyObject *__pyx_tuple__4; static PyObject *__pyx_codeobj__3; -/* "spacy/en.pyx":24 +/* "spacy/en.pyx":25 * * * def load_tokenization(token_rules): # <<<<<<<<<<<<<< @@ -1051,7 +1114,7 @@ static PyObject *__pyx_pf_5spacy_2en_load_tokenization(CYTHON_UNUSED PyObject *_ int __pyx_clineno = 0; __Pyx_RefNannySetupContext("load_tokenization", 0); - /* "spacy/en.pyx":27 + /* "spacy/en.pyx":28 * cdef Lexeme* word * cdef StringHash hashed * for chunk, lex, tokens in token_rules: # <<<<<<<<<<<<<< @@ -1062,7 +1125,7 @@ static PyObject *__pyx_pf_5spacy_2en_load_tokenization(CYTHON_UNUSED PyObject *_ __pyx_t_1 = __pyx_v_token_rules; __Pyx_INCREF(__pyx_t_1); __pyx_t_2 = 0; __pyx_t_3 = NULL; } else { - __pyx_t_2 = -1; __pyx_t_1 = PyObject_GetIter(__pyx_v_token_rules); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = -1; __pyx_t_1 = PyObject_GetIter(__pyx_v_token_rules); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __pyx_t_3 = Py_TYPE(__pyx_t_1)->tp_iternext; } @@ -1070,16 +1133,16 @@ static PyObject *__pyx_pf_5spacy_2en_load_tokenization(CYTHON_UNUSED PyObject *_ if (!__pyx_t_3 && PyList_CheckExact(__pyx_t_1)) { if (__pyx_t_2 >= PyList_GET_SIZE(__pyx_t_1)) break; #if CYTHON_COMPILING_IN_CPYTHON - __pyx_t_4 = PyList_GET_ITEM(__pyx_t_1, __pyx_t_2); __Pyx_INCREF(__pyx_t_4); __pyx_t_2++; if (unlikely(0 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_4 = PyList_GET_ITEM(__pyx_t_1, __pyx_t_2); __Pyx_INCREF(__pyx_t_4); __pyx_t_2++; if (unlikely(0 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} #else - __pyx_t_4 = PySequence_ITEM(__pyx_t_1, __pyx_t_2); __pyx_t_2++; if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_4 = PySequence_ITEM(__pyx_t_1, __pyx_t_2); __pyx_t_2++; if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} #endif } else if (!__pyx_t_3 && PyTuple_CheckExact(__pyx_t_1)) { if (__pyx_t_2 >= PyTuple_GET_SIZE(__pyx_t_1)) break; #if CYTHON_COMPILING_IN_CPYTHON - __pyx_t_4 = PyTuple_GET_ITEM(__pyx_t_1, __pyx_t_2); __Pyx_INCREF(__pyx_t_4); __pyx_t_2++; if (unlikely(0 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_4 = PyTuple_GET_ITEM(__pyx_t_1, __pyx_t_2); __Pyx_INCREF(__pyx_t_4); __pyx_t_2++; if (unlikely(0 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} #else - __pyx_t_4 = PySequence_ITEM(__pyx_t_1, __pyx_t_2); __pyx_t_2++; if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_4 = PySequence_ITEM(__pyx_t_1, __pyx_t_2); __pyx_t_2++; if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} #endif } else { __pyx_t_4 = __pyx_t_3(__pyx_t_1); @@ -1087,7 +1150,7 @@ static PyObject *__pyx_pf_5spacy_2en_load_tokenization(CYTHON_UNUSED PyObject *_ PyObject* exc_type = PyErr_Occurred(); if (exc_type) { if (likely(exc_type == PyExc_StopIteration || PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); - else {__pyx_filename = __pyx_f[0]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + else {__pyx_filename = __pyx_f[0]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } break; } @@ -1103,7 +1166,7 @@ static PyObject *__pyx_pf_5spacy_2en_load_tokenization(CYTHON_UNUSED PyObject *_ if (unlikely(size != 3)) { if (size > 3) __Pyx_RaiseTooManyValuesError(3); else if (size >= 0) __Pyx_RaiseNeedMoreValuesError(size); - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } #if CYTHON_COMPILING_IN_CPYTHON if (likely(PyTuple_CheckExact(sequence))) { @@ -1119,17 +1182,17 @@ static PyObject *__pyx_pf_5spacy_2en_load_tokenization(CYTHON_UNUSED PyObject *_ __Pyx_INCREF(__pyx_t_6); __Pyx_INCREF(__pyx_t_7); #else - __pyx_t_5 = PySequence_ITEM(sequence, 0); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_5 = PySequence_ITEM(sequence, 0); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_5); - __pyx_t_6 = PySequence_ITEM(sequence, 1); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_6 = PySequence_ITEM(sequence, 1); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_6); - __pyx_t_7 = PySequence_ITEM(sequence, 2); if (unlikely(!__pyx_t_7)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_7 = PySequence_ITEM(sequence, 2); if (unlikely(!__pyx_t_7)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_7); #endif __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; } else { Py_ssize_t index = -1; - __pyx_t_8 = PyObject_GetIter(__pyx_t_4); if (unlikely(!__pyx_t_8)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_8 = PyObject_GetIter(__pyx_t_4); if (unlikely(!__pyx_t_8)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_8); __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; __pyx_t_9 = Py_TYPE(__pyx_t_8)->tp_iternext; @@ -1139,7 +1202,7 @@ static PyObject *__pyx_pf_5spacy_2en_load_tokenization(CYTHON_UNUSED PyObject *_ __Pyx_GOTREF(__pyx_t_6); index = 2; __pyx_t_7 = __pyx_t_9(__pyx_t_8); if (unlikely(!__pyx_t_7)) goto __pyx_L5_unpacking_failed; __Pyx_GOTREF(__pyx_t_7); - if (__Pyx_IternextUnpackEndCheck(__pyx_t_9(__pyx_t_8), 3) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (__Pyx_IternextUnpackEndCheck(__pyx_t_9(__pyx_t_8), 3) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_t_9 = NULL; __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; goto __pyx_L6_unpacking_done; @@ -1147,7 +1210,7 @@ static PyObject *__pyx_pf_5spacy_2en_load_tokenization(CYTHON_UNUSED PyObject *_ __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; __pyx_t_9 = NULL; if (__Pyx_IterFinish() == 0) __Pyx_RaiseNeedMoreValuesError(index); - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_L6_unpacking_done:; } __Pyx_XDECREF_SET(__pyx_v_chunk, __pyx_t_5); @@ -1157,19 +1220,19 @@ static PyObject *__pyx_pf_5spacy_2en_load_tokenization(CYTHON_UNUSED PyObject *_ __Pyx_XDECREF_SET(__pyx_v_tokens, __pyx_t_7); __pyx_t_7 = 0; - /* "spacy/en.pyx":28 + /* "spacy/en.pyx":29 * cdef StringHash hashed * for chunk, lex, tokens in token_rules: * hashed = hash_string(chunk, len(chunk)) # <<<<<<<<<<<<<< * assert LEXEMES[hashed] == NULL * word = _add(hashed, lex, len(lex), len(lex)) */ - if (!(likely(PyUnicode_CheckExact(__pyx_v_chunk))||((__pyx_v_chunk) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "unicode", Py_TYPE(__pyx_v_chunk)->tp_name), 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_t_10 = PyObject_Length(__pyx_v_chunk); if (unlikely(__pyx_t_10 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_t_11 = __pyx_f_5spacy_2en_hash_string(((PyObject*)__pyx_v_chunk), __pyx_t_10); if (unlikely(__pyx_t_11 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (!(likely(PyUnicode_CheckExact(__pyx_v_chunk))||((__pyx_v_chunk) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "unicode", Py_TYPE(__pyx_v_chunk)->tp_name), 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 29; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_10 = PyObject_Length(__pyx_v_chunk); if (unlikely(__pyx_t_10 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 29; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_11 = __pyx_f_5spacy_2en_hash_string(((PyObject*)__pyx_v_chunk), __pyx_t_10); if (unlikely(__pyx_t_11 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 29; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_hashed = __pyx_t_11; - /* "spacy/en.pyx":29 + /* "spacy/en.pyx":30 * for chunk, lex, tokens in token_rules: * hashed = hash_string(chunk, len(chunk)) * assert LEXEMES[hashed] == NULL # <<<<<<<<<<<<<< @@ -1180,25 +1243,25 @@ static PyObject *__pyx_pf_5spacy_2en_load_tokenization(CYTHON_UNUSED PyObject *_ if (unlikely(!Py_OptimizeFlag)) { if (unlikely(!(((__pyx_v_5spacy_2en_LEXEMES[__pyx_v_hashed]) == NULL) != 0))) { PyErr_SetNone(PyExc_AssertionError); - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 29; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 30; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } } #endif - /* "spacy/en.pyx":30 + /* "spacy/en.pyx":31 * hashed = hash_string(chunk, len(chunk)) * assert LEXEMES[hashed] == NULL * word = _add(hashed, lex, len(lex), len(lex)) # <<<<<<<<<<<<<< * for i, lex in enumerate(tokens): * token_string = '%s:@:%d:@:%s' % (chunk, i, lex) */ - if (!(likely(PyUnicode_CheckExact(__pyx_v_lex))||((__pyx_v_lex) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "unicode", Py_TYPE(__pyx_v_lex)->tp_name), 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 30; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_t_10 = PyObject_Length(__pyx_v_lex); if (unlikely(__pyx_t_10 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 30; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_t_12 = PyObject_Length(__pyx_v_lex); if (unlikely(__pyx_t_12 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 30; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_t_13 = __pyx_f_5spacy_2en__add(__pyx_v_hashed, ((PyObject*)__pyx_v_lex), __pyx_t_10, __pyx_t_12); if (unlikely(__pyx_t_13 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 30; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (!(likely(PyUnicode_CheckExact(__pyx_v_lex))||((__pyx_v_lex) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "unicode", Py_TYPE(__pyx_v_lex)->tp_name), 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_10 = PyObject_Length(__pyx_v_lex); if (unlikely(__pyx_t_10 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_12 = PyObject_Length(__pyx_v_lex); if (unlikely(__pyx_t_12 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_13 = __pyx_f_5spacy_2en__add(__pyx_v_hashed, ((PyObject*)__pyx_v_lex), __pyx_t_10, __pyx_t_12); if (unlikely(__pyx_t_13 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_word = __pyx_t_13; - /* "spacy/en.pyx":31 + /* "spacy/en.pyx":32 * assert LEXEMES[hashed] == NULL * word = _add(hashed, lex, len(lex), len(lex)) * for i, lex in enumerate(tokens): # <<<<<<<<<<<<<< @@ -1211,7 +1274,7 @@ static PyObject *__pyx_pf_5spacy_2en_load_tokenization(CYTHON_UNUSED PyObject *_ __pyx_t_7 = __pyx_v_tokens; __Pyx_INCREF(__pyx_t_7); __pyx_t_12 = 0; __pyx_t_14 = NULL; } else { - __pyx_t_12 = -1; __pyx_t_7 = PyObject_GetIter(__pyx_v_tokens); if (unlikely(!__pyx_t_7)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_12 = -1; __pyx_t_7 = PyObject_GetIter(__pyx_v_tokens); if (unlikely(!__pyx_t_7)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_7); __pyx_t_14 = Py_TYPE(__pyx_t_7)->tp_iternext; } @@ -1219,16 +1282,16 @@ static PyObject *__pyx_pf_5spacy_2en_load_tokenization(CYTHON_UNUSED PyObject *_ if (!__pyx_t_14 && PyList_CheckExact(__pyx_t_7)) { if (__pyx_t_12 >= PyList_GET_SIZE(__pyx_t_7)) break; #if CYTHON_COMPILING_IN_CPYTHON - __pyx_t_6 = PyList_GET_ITEM(__pyx_t_7, __pyx_t_12); __Pyx_INCREF(__pyx_t_6); __pyx_t_12++; if (unlikely(0 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_6 = PyList_GET_ITEM(__pyx_t_7, __pyx_t_12); __Pyx_INCREF(__pyx_t_6); __pyx_t_12++; if (unlikely(0 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} #else - __pyx_t_6 = PySequence_ITEM(__pyx_t_7, __pyx_t_12); __pyx_t_12++; if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_6 = PySequence_ITEM(__pyx_t_7, __pyx_t_12); __pyx_t_12++; if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} #endif } else if (!__pyx_t_14 && PyTuple_CheckExact(__pyx_t_7)) { if (__pyx_t_12 >= PyTuple_GET_SIZE(__pyx_t_7)) break; #if CYTHON_COMPILING_IN_CPYTHON - __pyx_t_6 = PyTuple_GET_ITEM(__pyx_t_7, __pyx_t_12); __Pyx_INCREF(__pyx_t_6); __pyx_t_12++; if (unlikely(0 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_6 = PyTuple_GET_ITEM(__pyx_t_7, __pyx_t_12); __Pyx_INCREF(__pyx_t_6); __pyx_t_12++; if (unlikely(0 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} #else - __pyx_t_6 = PySequence_ITEM(__pyx_t_7, __pyx_t_12); __pyx_t_12++; if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_6 = PySequence_ITEM(__pyx_t_7, __pyx_t_12); __pyx_t_12++; if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} #endif } else { __pyx_t_6 = __pyx_t_14(__pyx_t_7); @@ -1236,7 +1299,7 @@ static PyObject *__pyx_pf_5spacy_2en_load_tokenization(CYTHON_UNUSED PyObject *_ PyObject* exc_type = PyErr_Occurred(); if (exc_type) { if (likely(exc_type == PyExc_StopIteration || PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); - else {__pyx_filename = __pyx_f[0]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + else {__pyx_filename = __pyx_f[0]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } break; } @@ -1246,20 +1309,20 @@ static PyObject *__pyx_pf_5spacy_2en_load_tokenization(CYTHON_UNUSED PyObject *_ __pyx_t_6 = 0; __Pyx_INCREF(__pyx_t_4); __Pyx_XDECREF_SET(__pyx_v_i, __pyx_t_4); - __pyx_t_6 = PyNumber_Add(__pyx_t_4, __pyx_int_1); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_6 = PyNumber_Add(__pyx_t_4, __pyx_int_1); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_6); __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = __pyx_t_6; __pyx_t_6 = 0; - /* "spacy/en.pyx":32 + /* "spacy/en.pyx":33 * word = _add(hashed, lex, len(lex), len(lex)) * for i, lex in enumerate(tokens): * token_string = '%s:@:%d:@:%s' % (chunk, i, lex) # <<<<<<<<<<<<<< * length = len(token_string) * hashed = hash_string(token_string, length) */ - __pyx_t_6 = PyTuple_New(3); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_6 = PyTuple_New(3); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 33; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_6); __Pyx_INCREF(__pyx_v_chunk); PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_v_chunk); @@ -1270,45 +1333,45 @@ static PyObject *__pyx_pf_5spacy_2en_load_tokenization(CYTHON_UNUSED PyObject *_ __Pyx_INCREF(__pyx_v_lex); PyTuple_SET_ITEM(__pyx_t_6, 2, __pyx_v_lex); __Pyx_GIVEREF(__pyx_v_lex); - __pyx_t_5 = PyUnicode_Format(__pyx_kp_u_s_d_s, __pyx_t_6); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_5 = PyUnicode_Format(__pyx_kp_u_s_d_s, __pyx_t_6); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 33; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_5); __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; __Pyx_XDECREF_SET(__pyx_v_token_string, ((PyObject*)__pyx_t_5)); __pyx_t_5 = 0; - /* "spacy/en.pyx":33 + /* "spacy/en.pyx":34 * for i, lex in enumerate(tokens): * token_string = '%s:@:%d:@:%s' % (chunk, i, lex) * length = len(token_string) # <<<<<<<<<<<<<< * hashed = hash_string(token_string, length) * word.tail = _add(hashed, lex, 0, len(lex)) */ - __pyx_t_10 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_token_string); if (unlikely(__pyx_t_10 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 33; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_10 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_token_string); if (unlikely(__pyx_t_10 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 34; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_length = __pyx_t_10; - /* "spacy/en.pyx":34 + /* "spacy/en.pyx":35 * token_string = '%s:@:%d:@:%s' % (chunk, i, lex) * length = len(token_string) * hashed = hash_string(token_string, length) # <<<<<<<<<<<<<< * word.tail = _add(hashed, lex, 0, len(lex)) * word = word.tail */ - __pyx_t_11 = __pyx_f_5spacy_2en_hash_string(__pyx_v_token_string, __pyx_v_length); if (unlikely(__pyx_t_11 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 34; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_11 = __pyx_f_5spacy_2en_hash_string(__pyx_v_token_string, __pyx_v_length); if (unlikely(__pyx_t_11 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 35; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_hashed = __pyx_t_11; - /* "spacy/en.pyx":35 + /* "spacy/en.pyx":36 * length = len(token_string) * hashed = hash_string(token_string, length) * word.tail = _add(hashed, lex, 0, len(lex)) # <<<<<<<<<<<<<< * word = word.tail * */ - if (!(likely(PyUnicode_CheckExact(__pyx_v_lex))||((__pyx_v_lex) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "unicode", Py_TYPE(__pyx_v_lex)->tp_name), 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 35; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_t_10 = PyObject_Length(__pyx_v_lex); if (unlikely(__pyx_t_10 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 35; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_t_13 = __pyx_f_5spacy_2en__add(__pyx_v_hashed, ((PyObject*)__pyx_v_lex), 0, __pyx_t_10); if (unlikely(__pyx_t_13 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 35; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (!(likely(PyUnicode_CheckExact(__pyx_v_lex))||((__pyx_v_lex) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "unicode", Py_TYPE(__pyx_v_lex)->tp_name), 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 36; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_10 = PyObject_Length(__pyx_v_lex); if (unlikely(__pyx_t_10 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 36; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_13 = __pyx_f_5spacy_2en__add(__pyx_v_hashed, ((PyObject*)__pyx_v_lex), 0, __pyx_t_10); if (unlikely(__pyx_t_13 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 36; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_word->tail = __pyx_t_13; - /* "spacy/en.pyx":36 + /* "spacy/en.pyx":37 * hashed = hash_string(token_string, length) * word.tail = _add(hashed, lex, 0, len(lex)) * word = word.tail # <<<<<<<<<<<<<< @@ -1323,7 +1386,7 @@ static PyObject *__pyx_pf_5spacy_2en_load_tokenization(CYTHON_UNUSED PyObject *_ } __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - /* "spacy/en.pyx":24 + /* "spacy/en.pyx":25 * * * def load_tokenization(token_rules): # <<<<<<<<<<<<<< @@ -1354,15 +1417,454 @@ static PyObject *__pyx_pf_5spacy_2en_load_tokenization(CYTHON_UNUSED PyObject *_ return __pyx_r; } -/* "spacy/en.pyx":41 - * load_tokenization(util.read_tokenization('en')) +/* "spacy/en.pyx":43 + * + * + * cpdef vector[Lexeme_addr] tokenize(unicode string) except *: # <<<<<<<<<<<<<< + * cdef size_t length = len(string) + * cdef Py_UNICODE* characters = string + */ + +static PyObject *__pyx_pw_5spacy_2en_3tokenize(PyObject *__pyx_self, PyObject *__pyx_v_string); /*proto*/ +static std::vector<__pyx_t_5spacy_2en_Lexeme_addr> __pyx_f_5spacy_2en_tokenize(PyObject *__pyx_v_string, CYTHON_UNUSED int __pyx_skip_dispatch) { + size_t __pyx_v_length; + Py_UNICODE *__pyx_v_characters; + size_t __pyx_v_i; + Py_UNICODE __pyx_v_c; + std::vector<__pyx_t_5spacy_2en_Lexeme_addr> __pyx_v_tokens; + PyObject *__pyx_v_current = 0; + struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_v_token; + std::vector<__pyx_t_5spacy_2en_Lexeme_addr> __pyx_r; + __Pyx_RefNannyDeclarations + Py_ssize_t __pyx_t_1; + Py_UNICODE *__pyx_t_2; + std::vector<__pyx_t_5spacy_2en_Lexeme_addr> __pyx_t_3; + size_t __pyx_t_4; + size_t __pyx_t_5; + int __pyx_t_6; + __pyx_t_5spacy_2en_Lexeme_addr __pyx_t_7; + struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_t_8; + PyObject *__pyx_t_9 = NULL; + PyObject *__pyx_t_10 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("tokenize", 0); + + /* "spacy/en.pyx":44 + * + * cpdef vector[Lexeme_addr] tokenize(unicode string) except *: + * cdef size_t length = len(string) # <<<<<<<<<<<<<< + * cdef Py_UNICODE* characters = string + * + */ + if (unlikely(__pyx_v_string == Py_None)) { + PyErr_SetString(PyExc_TypeError, "object of type 'NoneType' has no len()"); + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 44; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + } + __pyx_t_1 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_string); if (unlikely(__pyx_t_1 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 44; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_v_length = __pyx_t_1; + + /* "spacy/en.pyx":45 + * cpdef vector[Lexeme_addr] tokenize(unicode string) except *: + * cdef size_t length = len(string) + * cdef Py_UNICODE* characters = string # <<<<<<<<<<<<<< + * + * cdef size_t i + */ + __pyx_t_2 = __Pyx_PyUnicode_AsUnicode(__pyx_v_string); if (unlikely((!__pyx_t_2) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 45; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_v_characters = ((Py_UNICODE *)__pyx_t_2); + + /* "spacy/en.pyx":50 + * cdef Py_UNICODE c + * + * cdef vector[Lexeme_addr] tokens = vector[Lexeme_addr]() # <<<<<<<<<<<<<< + * cdef unicode current = u'' + * cdef Lexeme* token + */ + try { + __pyx_t_3 = std::vector<__pyx_t_5spacy_2en_Lexeme_addr>(); + } catch(...) { + __Pyx_CppExn2PyErr(); + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 50; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + } + __pyx_v_tokens = __pyx_t_3; + + /* "spacy/en.pyx":51 + * + * cdef vector[Lexeme_addr] tokens = vector[Lexeme_addr]() + * cdef unicode current = u'' # <<<<<<<<<<<<<< + * cdef Lexeme* token + * for i in range(length): + */ + __Pyx_INCREF(__pyx_kp_u_); + __pyx_v_current = __pyx_kp_u_; + + /* "spacy/en.pyx":53 + * cdef unicode current = u'' + * cdef Lexeme* token + * for i in range(length): # <<<<<<<<<<<<<< + * c = characters[i] + * if is_whitespace(c): + */ + __pyx_t_4 = __pyx_v_length; + for (__pyx_t_5 = 0; __pyx_t_5 < __pyx_t_4; __pyx_t_5+=1) { + __pyx_v_i = __pyx_t_5; + + /* "spacy/en.pyx":54 + * cdef Lexeme* token + * for i in range(length): + * c = characters[i] # <<<<<<<<<<<<<< + * if is_whitespace(c): + * if current: + */ + __pyx_v_c = (__pyx_v_characters[__pyx_v_i]); + + /* "spacy/en.pyx":55 + * for i in range(length): + * c = characters[i] + * if is_whitespace(c): # <<<<<<<<<<<<<< + * if current: + * token = lookup(current) + */ + __pyx_t_6 = (__pyx_f_5spacy_2en_is_whitespace(__pyx_v_c) != 0); + if (__pyx_t_6) { + + /* "spacy/en.pyx":56 + * c = characters[i] + * if is_whitespace(c): + * if current: # <<<<<<<<<<<<<< + * token = lookup(current) + * while token != NULL: + */ + __pyx_t_6 = (__pyx_v_current != Py_None) && (PyUnicode_GET_SIZE(__pyx_v_current) != 0); + if (__pyx_t_6) { + + /* "spacy/en.pyx":57 + * if is_whitespace(c): + * if current: + * token = lookup(current) # <<<<<<<<<<<<<< + * while token != NULL: + * tokens.push_back(token) + */ + __pyx_t_7 = __pyx_f_5spacy_2en_lookup(__pyx_v_current, 0); if (unlikely(__pyx_t_7 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 57; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_v_token = ((struct __pyx_t_5spacy_6lexeme_Lexeme *)__pyx_t_7); + + /* "spacy/en.pyx":58 + * if current: + * token = lookup(current) + * while token != NULL: # <<<<<<<<<<<<<< + * tokens.push_back(token) + * token = token.tail + */ + while (1) { + __pyx_t_6 = ((__pyx_v_token != NULL) != 0); + if (!__pyx_t_6) break; + + /* "spacy/en.pyx":59 + * token = lookup(current) + * while token != NULL: + * tokens.push_back(token) # <<<<<<<<<<<<<< + * token = token.tail + * current = u'' + */ + __pyx_v_tokens.push_back(((__pyx_t_5spacy_2en_Lexeme_addr)__pyx_v_token)); + + /* "spacy/en.pyx":60 + * while token != NULL: + * tokens.push_back(token) + * token = token.tail # <<<<<<<<<<<<<< + * current = u'' + * else: + */ + __pyx_t_8 = __pyx_v_token->tail; + __pyx_v_token = __pyx_t_8; + } + goto __pyx_L6; + } + __pyx_L6:; + + /* "spacy/en.pyx":61 + * tokens.push_back(token) + * token = token.tail + * current = u'' # <<<<<<<<<<<<<< + * else: + * current += c + */ + __Pyx_INCREF(__pyx_kp_u_); + __Pyx_DECREF_SET(__pyx_v_current, __pyx_kp_u_); + goto __pyx_L5; + } + /*else*/ { + + /* "spacy/en.pyx":63 + * current = u'' + * else: + * current += c # <<<<<<<<<<<<<< + * if current: + * token = lookup(current) + */ + __pyx_t_9 = PyUnicode_FromOrdinal(__pyx_v_c); if (unlikely(!__pyx_t_9)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 63; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_9); + __pyx_t_10 = __Pyx_PyUnicode_ConcatSafe(__pyx_v_current, __pyx_t_9); if (unlikely(!__pyx_t_10)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 63; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_10); + __Pyx_DECREF(__pyx_t_9); __pyx_t_9 = 0; + __Pyx_DECREF_SET(__pyx_v_current, ((PyObject*)__pyx_t_10)); + __pyx_t_10 = 0; + } + __pyx_L5:; + } + + /* "spacy/en.pyx":64 + * else: + * current += c + * if current: # <<<<<<<<<<<<<< + * token = lookup(current) + * while token != NULL: + */ + __pyx_t_6 = (__pyx_v_current != Py_None) && (PyUnicode_GET_SIZE(__pyx_v_current) != 0); + if (__pyx_t_6) { + + /* "spacy/en.pyx":65 + * current += c + * if current: + * token = lookup(current) # <<<<<<<<<<<<<< + * while token != NULL: + * tokens.push_back(token) + */ + __pyx_t_7 = __pyx_f_5spacy_2en_lookup(__pyx_v_current, 0); if (unlikely(__pyx_t_7 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 65; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_v_token = ((struct __pyx_t_5spacy_6lexeme_Lexeme *)__pyx_t_7); + + /* "spacy/en.pyx":66 + * if current: + * token = lookup(current) + * while token != NULL: # <<<<<<<<<<<<<< + * tokens.push_back(token) + * token = token.tail + */ + while (1) { + __pyx_t_6 = ((__pyx_v_token != NULL) != 0); + if (!__pyx_t_6) break; + + /* "spacy/en.pyx":67 + * token = lookup(current) + * while token != NULL: + * tokens.push_back(token) # <<<<<<<<<<<<<< + * token = token.tail + * return tokens + */ + __pyx_v_tokens.push_back(((__pyx_t_5spacy_2en_Lexeme_addr)__pyx_v_token)); + + /* "spacy/en.pyx":68 + * while token != NULL: + * tokens.push_back(token) + * token = token.tail # <<<<<<<<<<<<<< + * return tokens + * + */ + __pyx_t_8 = __pyx_v_token->tail; + __pyx_v_token = __pyx_t_8; + } + goto __pyx_L9; + } + __pyx_L9:; + + /* "spacy/en.pyx":69 + * tokens.push_back(token) + * token = token.tail + * return tokens # <<<<<<<<<<<<<< + * + * cdef inline bint is_whitespace(Py_UNICODE c): + */ + __pyx_r = __pyx_v_tokens; + goto __pyx_L0; + + /* "spacy/en.pyx":43 + * + * + * cpdef vector[Lexeme_addr] tokenize(unicode string) except *: # <<<<<<<<<<<<<< + * cdef size_t length = len(string) + * cdef Py_UNICODE* characters = string + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_9); + __Pyx_XDECREF(__pyx_t_10); + __Pyx_AddTraceback("spacy.en.tokenize", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_L0:; + __Pyx_XDECREF(__pyx_v_current); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* Python wrapper */ +static PyObject *__pyx_pw_5spacy_2en_3tokenize(PyObject *__pyx_self, PyObject *__pyx_v_string); /*proto*/ +static PyObject *__pyx_pw_5spacy_2en_3tokenize(PyObject *__pyx_self, PyObject *__pyx_v_string) { + CYTHON_UNUSED int __pyx_lineno = 0; + CYTHON_UNUSED const char *__pyx_filename = NULL; + CYTHON_UNUSED int __pyx_clineno = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("tokenize (wrapper)", 0); + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_string), (&PyUnicode_Type), 1, "string", 1))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 43; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_r = __pyx_pf_5spacy_2en_2tokenize(__pyx_self, ((PyObject*)__pyx_v_string)); + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __pyx_r = NULL; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5spacy_2en_2tokenize(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_string) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + std::vector<__pyx_t_5spacy_2en_Lexeme_addr> __pyx_t_1; + PyObject *__pyx_t_2 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("tokenize", 0); + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = __pyx_f_5spacy_2en_tokenize(__pyx_v_string, 0); if (unlikely(PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 43; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __pyx_convert_vector_to_py___pyx_t_5spacy_2en_Lexeme_addr(__pyx_t_1); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 43; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_2); + __pyx_r = __pyx_t_2; + __pyx_t_2 = 0; + goto __pyx_L0; + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_2); + __Pyx_AddTraceback("spacy.en.tokenize", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "spacy/en.pyx":71 + * return tokens + * + * cdef inline bint is_whitespace(Py_UNICODE c): # <<<<<<<<<<<<<< + * # TODO: Support other unicode spaces + * # https://www.cs.tut.fi/~jkorpela/chars/spaces.html + */ + +static CYTHON_INLINE int __pyx_f_5spacy_2en_is_whitespace(Py_UNICODE __pyx_v_c) { + int __pyx_r; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("is_whitespace", 0); + + /* "spacy/en.pyx":78 + * elif c == u'\n': + * return True + * elif c == u'\t': # <<<<<<<<<<<<<< + * return True + * else: + */ + switch (__pyx_v_c) { + + /* "spacy/en.pyx":74 + * # TODO: Support other unicode spaces + * # https://www.cs.tut.fi/~jkorpela/chars/spaces.html + * if c == u' ': # <<<<<<<<<<<<<< + * return True + * elif c == u'\n': + */ + case 32: + + /* "spacy/en.pyx":75 + * # https://www.cs.tut.fi/~jkorpela/chars/spaces.html + * if c == u' ': + * return True # <<<<<<<<<<<<<< + * elif c == u'\n': + * return True + */ + __pyx_r = 1; + goto __pyx_L0; + break; + + /* "spacy/en.pyx":76 + * if c == u' ': + * return True + * elif c == u'\n': # <<<<<<<<<<<<<< + * return True + * elif c == u'\t': + */ + case 10: + + /* "spacy/en.pyx":77 + * return True + * elif c == u'\n': + * return True # <<<<<<<<<<<<<< + * elif c == u'\t': + * return True + */ + __pyx_r = 1; + goto __pyx_L0; + break; + + /* "spacy/en.pyx":78 + * elif c == u'\n': + * return True + * elif c == u'\t': # <<<<<<<<<<<<<< + * return True + * else: + */ + case 9: + + /* "spacy/en.pyx":79 + * return True + * elif c == u'\t': + * return True # <<<<<<<<<<<<<< + * else: + * return False + */ + __pyx_r = 1; + goto __pyx_L0; + break; + default: + + /* "spacy/en.pyx":81 + * return True + * else: + * return False # <<<<<<<<<<<<<< + * + * cpdef Lexeme_addr lookup(unicode string) except 0: + */ + __pyx_r = 0; + goto __pyx_L0; + break; + } + + /* "spacy/en.pyx":71 + * return tokens + * + * cdef inline bint is_whitespace(Py_UNICODE c): # <<<<<<<<<<<<<< + * # TODO: Support other unicode spaces + * # https://www.cs.tut.fi/~jkorpela/chars/spaces.html + */ + + /* function exit code */ + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "spacy/en.pyx":83 + * return False * * cpdef Lexeme_addr lookup(unicode string) except 0: # <<<<<<<<<<<<<< * '''.. function:: enumerate(sequence[, start=0]) * Fetch a Lexeme representing a word string. If the word has not been seen, */ -static PyObject *__pyx_pw_5spacy_2en_3lookup(PyObject *__pyx_self, PyObject *__pyx_v_string); /*proto*/ +static PyObject *__pyx_pw_5spacy_2en_5lookup(PyObject *__pyx_self, PyObject *__pyx_v_string); /*proto*/ static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup(PyObject *__pyx_v_string, CYTHON_UNUSED int __pyx_skip_dispatch) { size_t __pyx_v_length; __pyx_t_5spacy_6lexeme_StringHash __pyx_v_hashed; @@ -1379,18 +1881,18 @@ static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup(PyObject *__pyx_ int __pyx_clineno = 0; __Pyx_RefNannySetupContext("lookup", 0); - /* "spacy/en.pyx":49 + /* "spacy/en.pyx":91 * To specify the boundaries of the word if it has not been seen, use lookup_chunk. * ''' * if string == '': # <<<<<<<<<<<<<< * return &BLANK_WORD * cdef size_t length = len(string) */ - __pyx_t_1 = (__Pyx_PyUnicode_Equals(__pyx_v_string, __pyx_kp_u_, Py_EQ)); if (unlikely(__pyx_t_1 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 49; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = (__Pyx_PyUnicode_Equals(__pyx_v_string, __pyx_kp_u_, Py_EQ)); if (unlikely(__pyx_t_1 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 91; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_t_2 = (__pyx_t_1 != 0); if (__pyx_t_2) { - /* "spacy/en.pyx":50 + /* "spacy/en.pyx":92 * ''' * if string == '': * return &BLANK_WORD # <<<<<<<<<<<<<< @@ -1401,7 +1903,7 @@ static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup(PyObject *__pyx_ goto __pyx_L0; } - /* "spacy/en.pyx":51 + /* "spacy/en.pyx":93 * if string == '': * return &BLANK_WORD * cdef size_t length = len(string) # <<<<<<<<<<<<<< @@ -1410,22 +1912,22 @@ static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup(PyObject *__pyx_ */ if (unlikely(__pyx_v_string == Py_None)) { PyErr_SetString(PyExc_TypeError, "object of type 'NoneType' has no len()"); - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 51; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 93; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - __pyx_t_3 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_string); if (unlikely(__pyx_t_3 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 51; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_string); if (unlikely(__pyx_t_3 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 93; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_length = __pyx_t_3; - /* "spacy/en.pyx":52 + /* "spacy/en.pyx":94 * return &BLANK_WORD * cdef size_t length = len(string) * cdef StringHash hashed = hash_string(string, length) # <<<<<<<<<<<<<< * cdef Lexeme* word_ptr = LEXEMES[hashed] * cdef size_t n */ - __pyx_t_4 = __pyx_f_5spacy_2en_hash_string(__pyx_v_string, __pyx_v_length); if (unlikely(__pyx_t_4 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 52; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_4 = __pyx_f_5spacy_2en_hash_string(__pyx_v_string, __pyx_v_length); if (unlikely(__pyx_t_4 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 94; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_hashed = __pyx_t_4; - /* "spacy/en.pyx":53 + /* "spacy/en.pyx":95 * cdef size_t length = len(string) * cdef StringHash hashed = hash_string(string, length) * cdef Lexeme* word_ptr = LEXEMES[hashed] # <<<<<<<<<<<<<< @@ -1434,7 +1936,7 @@ static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup(PyObject *__pyx_ */ __pyx_v_word_ptr = (__pyx_v_5spacy_2en_LEXEMES[__pyx_v_hashed]); - /* "spacy/en.pyx":55 + /* "spacy/en.pyx":97 * cdef Lexeme* word_ptr = LEXEMES[hashed] * cdef size_t n * if word_ptr == NULL: # <<<<<<<<<<<<<< @@ -1444,20 +1946,20 @@ static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup(PyObject *__pyx_ __pyx_t_2 = ((__pyx_v_word_ptr == NULL) != 0); if (__pyx_t_2) { - /* "spacy/en.pyx":56 + /* "spacy/en.pyx":98 * cdef size_t n * if word_ptr == NULL: * word_ptr = _add(hashed, string, _find_split(string, length), length) # <<<<<<<<<<<<<< * return word_ptr * */ - __pyx_t_5 = __pyx_f_5spacy_2en__add(__pyx_v_hashed, __pyx_v_string, __pyx_f_5spacy_2en__find_split(__pyx_v_string, __pyx_v_length), __pyx_v_length); if (unlikely(__pyx_t_5 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 56; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_5 = __pyx_f_5spacy_2en__add(__pyx_v_hashed, __pyx_v_string, __pyx_f_5spacy_2en__find_split(__pyx_v_string, __pyx_v_length), __pyx_v_length); if (unlikely(__pyx_t_5 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 98; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_word_ptr = __pyx_t_5; goto __pyx_L4; } __pyx_L4:; - /* "spacy/en.pyx":57 + /* "spacy/en.pyx":99 * if word_ptr == NULL: * word_ptr = _add(hashed, string, _find_split(string, length), length) * return word_ptr # <<<<<<<<<<<<<< @@ -1467,8 +1969,8 @@ static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup(PyObject *__pyx_ __pyx_r = ((__pyx_t_5spacy_2en_Lexeme_addr)__pyx_v_word_ptr); goto __pyx_L0; - /* "spacy/en.pyx":41 - * load_tokenization(util.read_tokenization('en')) + /* "spacy/en.pyx":83 + * return False * * cpdef Lexeme_addr lookup(unicode string) except 0: # <<<<<<<<<<<<<< * '''.. function:: enumerate(sequence[, start=0]) @@ -1485,17 +1987,17 @@ static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup(PyObject *__pyx_ } /* Python wrapper */ -static PyObject *__pyx_pw_5spacy_2en_3lookup(PyObject *__pyx_self, PyObject *__pyx_v_string); /*proto*/ -static char __pyx_doc_5spacy_2en_2lookup[] = ".. function:: enumerate(sequence[, start=0])\n Fetch a Lexeme representing a word string. If the word has not been seen,\n construct one, splitting off any attached punctuation or clitics. A\n reference to BLANK_WORD is returned for the empty string.\n \n To specify the boundaries of the word if it has not been seen, use lookup_chunk.\n "; -static PyObject *__pyx_pw_5spacy_2en_3lookup(PyObject *__pyx_self, PyObject *__pyx_v_string) { +static PyObject *__pyx_pw_5spacy_2en_5lookup(PyObject *__pyx_self, PyObject *__pyx_v_string); /*proto*/ +static char __pyx_doc_5spacy_2en_4lookup[] = ".. function:: enumerate(sequence[, start=0])\n Fetch a Lexeme representing a word string. If the word has not been seen,\n construct one, splitting off any attached punctuation or clitics. A\n reference to BLANK_WORD is returned for the empty string.\n \n To specify the boundaries of the word if it has not been seen, use lookup_chunk.\n "; +static PyObject *__pyx_pw_5spacy_2en_5lookup(PyObject *__pyx_self, PyObject *__pyx_v_string) { CYTHON_UNUSED int __pyx_lineno = 0; CYTHON_UNUSED const char *__pyx_filename = NULL; CYTHON_UNUSED int __pyx_clineno = 0; PyObject *__pyx_r = 0; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("lookup (wrapper)", 0); - if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_string), (&PyUnicode_Type), 1, "string", 1))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 41; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_r = __pyx_pf_5spacy_2en_2lookup(__pyx_self, ((PyObject*)__pyx_v_string)); + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_string), (&PyUnicode_Type), 1, "string", 1))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 83; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_r = __pyx_pf_5spacy_2en_4lookup(__pyx_self, ((PyObject*)__pyx_v_string)); /* function exit code */ goto __pyx_L0; @@ -1506,7 +2008,7 @@ static PyObject *__pyx_pw_5spacy_2en_3lookup(PyObject *__pyx_self, PyObject *__p return __pyx_r; } -static PyObject *__pyx_pf_5spacy_2en_2lookup(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_string) { +static PyObject *__pyx_pf_5spacy_2en_4lookup(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_string) { PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations __pyx_t_5spacy_2en_Lexeme_addr __pyx_t_1; @@ -1516,8 +2018,8 @@ static PyObject *__pyx_pf_5spacy_2en_2lookup(CYTHON_UNUSED PyObject *__pyx_self, int __pyx_clineno = 0; __Pyx_RefNannySetupContext("lookup", 0); __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = __pyx_f_5spacy_2en_lookup(__pyx_v_string, 0); if (unlikely(__pyx_t_1 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 41; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_t_2 = __Pyx_PyInt_FromSize_t(__pyx_t_1); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 41; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __pyx_f_5spacy_2en_lookup(__pyx_v_string, 0); if (unlikely(__pyx_t_1 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 83; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyInt_FromSize_t(__pyx_t_1); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 83; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __pyx_r = __pyx_t_2; __pyx_t_2 = 0; @@ -1534,7 +2036,7 @@ static PyObject *__pyx_pf_5spacy_2en_2lookup(CYTHON_UNUSED PyObject *__pyx_self, return __pyx_r; } -/* "spacy/en.pyx":60 +/* "spacy/en.pyx":102 * * * cpdef Lexeme_addr lookup_chunk(unicode string, int start, int end) except 0: # <<<<<<<<<<<<<< @@ -1542,7 +2044,7 @@ static PyObject *__pyx_pf_5spacy_2en_2lookup(CYTHON_UNUSED PyObject *__pyx_self, * construct one, given the specified start and end indices. A negative index */ -static PyObject *__pyx_pw_5spacy_2en_5lookup_chunk(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static PyObject *__pyx_pw_5spacy_2en_7lookup_chunk(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup_chunk(PyObject *__pyx_v_string, int __pyx_v_start, CYTHON_UNUSED int __pyx_v_end, CYTHON_UNUSED int __pyx_skip_dispatch) { size_t __pyx_v_length; __pyx_t_5spacy_6lexeme_StringHash __pyx_v_hashed; @@ -1559,18 +2061,18 @@ static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup_chunk(PyObject * int __pyx_clineno = 0; __Pyx_RefNannySetupContext("lookup_chunk", 0); - /* "spacy/en.pyx":68 + /* "spacy/en.pyx":110 * A reference to BLANK_WORD is returned for the empty string. * ''' * if string == '': # <<<<<<<<<<<<<< * return &BLANK_WORD * cdef size_t length = len(string) */ - __pyx_t_1 = (__Pyx_PyUnicode_Equals(__pyx_v_string, __pyx_kp_u_, Py_EQ)); if (unlikely(__pyx_t_1 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 68; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = (__Pyx_PyUnicode_Equals(__pyx_v_string, __pyx_kp_u_, Py_EQ)); if (unlikely(__pyx_t_1 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 110; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_t_2 = (__pyx_t_1 != 0); if (__pyx_t_2) { - /* "spacy/en.pyx":69 + /* "spacy/en.pyx":111 * ''' * if string == '': * return &BLANK_WORD # <<<<<<<<<<<<<< @@ -1581,7 +2083,7 @@ static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup_chunk(PyObject * goto __pyx_L0; } - /* "spacy/en.pyx":70 + /* "spacy/en.pyx":112 * if string == '': * return &BLANK_WORD * cdef size_t length = len(string) # <<<<<<<<<<<<<< @@ -1590,22 +2092,22 @@ static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup_chunk(PyObject * */ if (unlikely(__pyx_v_string == Py_None)) { PyErr_SetString(PyExc_TypeError, "object of type 'NoneType' has no len()"); - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 70; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 112; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - __pyx_t_3 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_string); if (unlikely(__pyx_t_3 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 70; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_string); if (unlikely(__pyx_t_3 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 112; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_length = __pyx_t_3; - /* "spacy/en.pyx":71 + /* "spacy/en.pyx":113 * return &BLANK_WORD * cdef size_t length = len(string) * cdef StringHash hashed = hash_string(string, length) # <<<<<<<<<<<<<< * cdef Lexeme* chunk_ptr = LEXEMES[hashed] * if chunk_ptr == NULL: */ - __pyx_t_4 = __pyx_f_5spacy_2en_hash_string(__pyx_v_string, __pyx_v_length); if (unlikely(__pyx_t_4 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 71; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_4 = __pyx_f_5spacy_2en_hash_string(__pyx_v_string, __pyx_v_length); if (unlikely(__pyx_t_4 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 113; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_hashed = __pyx_t_4; - /* "spacy/en.pyx":72 + /* "spacy/en.pyx":114 * cdef size_t length = len(string) * cdef StringHash hashed = hash_string(string, length) * cdef Lexeme* chunk_ptr = LEXEMES[hashed] # <<<<<<<<<<<<<< @@ -1614,7 +2116,7 @@ static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup_chunk(PyObject * */ __pyx_v_chunk_ptr = (__pyx_v_5spacy_2en_LEXEMES[__pyx_v_hashed]); - /* "spacy/en.pyx":73 + /* "spacy/en.pyx":115 * cdef StringHash hashed = hash_string(string, length) * cdef Lexeme* chunk_ptr = LEXEMES[hashed] * if chunk_ptr == NULL: # <<<<<<<<<<<<<< @@ -1624,20 +2126,20 @@ static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup_chunk(PyObject * __pyx_t_2 = ((__pyx_v_chunk_ptr == NULL) != 0); if (__pyx_t_2) { - /* "spacy/en.pyx":74 + /* "spacy/en.pyx":116 * cdef Lexeme* chunk_ptr = LEXEMES[hashed] * if chunk_ptr == NULL: * chunk_ptr = _add(hashed, string, start, length) # <<<<<<<<<<<<<< * return chunk_ptr * */ - __pyx_t_5 = __pyx_f_5spacy_2en__add(__pyx_v_hashed, __pyx_v_string, __pyx_v_start, __pyx_v_length); if (unlikely(__pyx_t_5 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 74; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_5 = __pyx_f_5spacy_2en__add(__pyx_v_hashed, __pyx_v_string, __pyx_v_start, __pyx_v_length); if (unlikely(__pyx_t_5 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 116; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_chunk_ptr = __pyx_t_5; goto __pyx_L4; } __pyx_L4:; - /* "spacy/en.pyx":75 + /* "spacy/en.pyx":117 * if chunk_ptr == NULL: * chunk_ptr = _add(hashed, string, start, length) * return chunk_ptr # <<<<<<<<<<<<<< @@ -1647,7 +2149,7 @@ static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup_chunk(PyObject * __pyx_r = ((__pyx_t_5spacy_2en_Lexeme_addr)__pyx_v_chunk_ptr); goto __pyx_L0; - /* "spacy/en.pyx":60 + /* "spacy/en.pyx":102 * * * cpdef Lexeme_addr lookup_chunk(unicode string, int start, int end) except 0: # <<<<<<<<<<<<<< @@ -1665,9 +2167,9 @@ static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup_chunk(PyObject * } /* Python wrapper */ -static PyObject *__pyx_pw_5spacy_2en_5lookup_chunk(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ -static char __pyx_doc_5spacy_2en_4lookup_chunk[] = "Fetch a Lexeme representing a word string. If the word has not been seen,\n construct one, given the specified start and end indices. A negative index\n significes 0 for start, and the string length for end --- i.e. the string\n will not be sliced if start == -1 and end == -1.\n \n A reference to BLANK_WORD is returned for the empty string.\n "; -static PyObject *__pyx_pw_5spacy_2en_5lookup_chunk(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { +static PyObject *__pyx_pw_5spacy_2en_7lookup_chunk(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static char __pyx_doc_5spacy_2en_6lookup_chunk[] = "Fetch a Lexeme representing a word string. If the word has not been seen,\n construct one, given the specified start and end indices. A negative index\n significes 0 for start, and the string length for end --- i.e. the string\n will not be sliced if start == -1 and end == -1.\n \n A reference to BLANK_WORD is returned for the empty string.\n "; +static PyObject *__pyx_pw_5spacy_2en_7lookup_chunk(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { PyObject *__pyx_v_string = 0; int __pyx_v_start; int __pyx_v_end; @@ -1698,16 +2200,16 @@ static PyObject *__pyx_pw_5spacy_2en_5lookup_chunk(PyObject *__pyx_self, PyObjec case 1: if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_start)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("lookup_chunk", 1, 3, 3, 1); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 60; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("lookup_chunk", 1, 3, 3, 1); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 102; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } case 2: if (likely((values[2] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_end)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("lookup_chunk", 1, 3, 3, 2); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 60; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("lookup_chunk", 1, 3, 3, 2); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 102; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } if (unlikely(kw_args > 0)) { - if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "lookup_chunk") < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 60; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "lookup_chunk") < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 102; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } else if (PyTuple_GET_SIZE(__pyx_args) != 3) { goto __pyx_L5_argtuple_error; @@ -1717,19 +2219,19 @@ static PyObject *__pyx_pw_5spacy_2en_5lookup_chunk(PyObject *__pyx_self, PyObjec values[2] = PyTuple_GET_ITEM(__pyx_args, 2); } __pyx_v_string = ((PyObject*)values[0]); - __pyx_v_start = __Pyx_PyInt_As_int(values[1]); if (unlikely((__pyx_v_start == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 60; __pyx_clineno = __LINE__; goto __pyx_L3_error;} - __pyx_v_end = __Pyx_PyInt_As_int(values[2]); if (unlikely((__pyx_v_end == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 60; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_start = __Pyx_PyInt_As_int(values[1]); if (unlikely((__pyx_v_start == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 102; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_end = __Pyx_PyInt_As_int(values[2]); if (unlikely((__pyx_v_end == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 102; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; - __Pyx_RaiseArgtupleInvalid("lookup_chunk", 1, 3, 3, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 60; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("lookup_chunk", 1, 3, 3, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 102; __pyx_clineno = __LINE__; goto __pyx_L3_error;} __pyx_L3_error:; __Pyx_AddTraceback("spacy.en.lookup_chunk", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); return NULL; __pyx_L4_argument_unpacking_done:; - if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_string), (&PyUnicode_Type), 1, "string", 1))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 60; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_r = __pyx_pf_5spacy_2en_4lookup_chunk(__pyx_self, __pyx_v_string, __pyx_v_start, __pyx_v_end); + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_string), (&PyUnicode_Type), 1, "string", 1))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 102; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_r = __pyx_pf_5spacy_2en_6lookup_chunk(__pyx_self, __pyx_v_string, __pyx_v_start, __pyx_v_end); /* function exit code */ goto __pyx_L0; @@ -1740,7 +2242,7 @@ static PyObject *__pyx_pw_5spacy_2en_5lookup_chunk(PyObject *__pyx_self, PyObjec return __pyx_r; } -static PyObject *__pyx_pf_5spacy_2en_4lookup_chunk(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_string, int __pyx_v_start, int __pyx_v_end) { +static PyObject *__pyx_pf_5spacy_2en_6lookup_chunk(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_string, int __pyx_v_start, int __pyx_v_end) { PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations __pyx_t_5spacy_2en_Lexeme_addr __pyx_t_1; @@ -1750,8 +2252,8 @@ static PyObject *__pyx_pf_5spacy_2en_4lookup_chunk(CYTHON_UNUSED PyObject *__pyx int __pyx_clineno = 0; __Pyx_RefNannySetupContext("lookup_chunk", 0); __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = __pyx_f_5spacy_2en_lookup_chunk(__pyx_v_string, __pyx_v_start, __pyx_v_end, 0); if (unlikely(__pyx_t_1 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 60; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_t_2 = __Pyx_PyInt_FromSize_t(__pyx_t_1); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 60; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __pyx_f_5spacy_2en_lookup_chunk(__pyx_v_string, __pyx_v_start, __pyx_v_end, 0); if (unlikely(__pyx_t_1 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 102; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyInt_FromSize_t(__pyx_t_1); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 102; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __pyx_r = __pyx_t_2; __pyx_t_2 = 0; @@ -1768,7 +2270,7 @@ static PyObject *__pyx_pf_5spacy_2en_4lookup_chunk(CYTHON_UNUSED PyObject *__pyx return __pyx_r; } -/* "spacy/en.pyx":78 +/* "spacy/en.pyx":120 * * * cdef StringHash hash_string(unicode s, size_t length) except 0: # <<<<<<<<<<<<<< @@ -1785,7 +2287,7 @@ static __pyx_t_5spacy_6lexeme_StringHash __pyx_f_5spacy_2en_hash_string(PyObject int __pyx_clineno = 0; __Pyx_RefNannySetupContext("hash_string", 0); - /* "spacy/en.pyx":80 + /* "spacy/en.pyx":122 * cdef StringHash hash_string(unicode s, size_t length) except 0: * '''Hash unicode with MurmurHash64A''' * assert length # <<<<<<<<<<<<<< @@ -1796,23 +2298,23 @@ static __pyx_t_5spacy_6lexeme_StringHash __pyx_f_5spacy_2en_hash_string(PyObject if (unlikely(!Py_OptimizeFlag)) { if (unlikely(!(__pyx_v_length != 0))) { PyErr_SetNone(PyExc_AssertionError); - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 80; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 122; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } } #endif - /* "spacy/en.pyx":81 + /* "spacy/en.pyx":123 * '''Hash unicode with MurmurHash64A''' * assert length * return MurmurHash64A(s, length * sizeof(Py_UNICODE), 0) # <<<<<<<<<<<<<< * * */ - __pyx_t_1 = __Pyx_PyUnicode_AsUnicode(__pyx_v_s); if (unlikely((!__pyx_t_1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 81; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyUnicode_AsUnicode(__pyx_v_s); if (unlikely((!__pyx_t_1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 123; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_r = MurmurHash64A(((__pyx_t_5spacy_2en_string_ptr)__pyx_t_1), (__pyx_v_length * (sizeof(Py_UNICODE))), 0); goto __pyx_L0; - /* "spacy/en.pyx":78 + /* "spacy/en.pyx":120 * * * cdef StringHash hash_string(unicode s, size_t length) except 0: # <<<<<<<<<<<<<< @@ -1829,7 +2331,7 @@ static __pyx_t_5spacy_6lexeme_StringHash __pyx_f_5spacy_2en_hash_string(PyObject return __pyx_r; } -/* "spacy/en.pyx":84 +/* "spacy/en.pyx":126 * * * cpdef unicode unhash(StringHash hash_value): # <<<<<<<<<<<<<< @@ -1837,7 +2339,7 @@ static __pyx_t_5spacy_6lexeme_StringHash __pyx_f_5spacy_2en_hash_string(PyObject * cdef string_ptr string = STRINGS[hash_value] */ -static PyObject *__pyx_pw_5spacy_2en_7unhash(PyObject *__pyx_self, PyObject *__pyx_arg_hash_value); /*proto*/ +static PyObject *__pyx_pw_5spacy_2en_9unhash(PyObject *__pyx_self, PyObject *__pyx_arg_hash_value); /*proto*/ static PyObject *__pyx_f_5spacy_2en_unhash(__pyx_t_5spacy_6lexeme_StringHash __pyx_v_hash_value, CYTHON_UNUSED int __pyx_skip_dispatch) { __pyx_t_5spacy_2en_string_ptr __pyx_v_string; PyObject *__pyx_r = NULL; @@ -1851,23 +2353,23 @@ static PyObject *__pyx_f_5spacy_2en_unhash(__pyx_t_5spacy_6lexeme_StringHash __p int __pyx_clineno = 0; __Pyx_RefNannySetupContext("unhash", 0); - /* "spacy/en.pyx":86 + /* "spacy/en.pyx":128 * cpdef unicode unhash(StringHash hash_value): * '''Fetch a string from the reverse index, given its hash value.''' * cdef string_ptr string = STRINGS[hash_value] # <<<<<<<<<<<<<< * if string == NULL: * raise ValueError(hash_value) */ - __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_STRINGS); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 86; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_STRINGS); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 128; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); - __pyx_t_2 = __Pyx_GetItemInt(__pyx_t_1, __pyx_v_hash_value, __pyx_t_5spacy_6lexeme_StringHash, 0, __Pyx_PyInt_From_uint64_t, 0, 0, 1); if (unlikely(__pyx_t_2 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 86; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + __pyx_t_2 = __Pyx_GetItemInt(__pyx_t_1, __pyx_v_hash_value, __pyx_t_5spacy_6lexeme_StringHash, 0, __Pyx_PyInt_From_uint64_t, 0, 0, 1); if (unlikely(__pyx_t_2 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 128; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_3 = __Pyx_PyUnicode_AsUnicode(__pyx_t_2); if (unlikely((!__pyx_t_3) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 86; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = __Pyx_PyUnicode_AsUnicode(__pyx_t_2); if (unlikely((!__pyx_t_3) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 128; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __pyx_v_string = __pyx_t_3; - /* "spacy/en.pyx":87 + /* "spacy/en.pyx":129 * '''Fetch a string from the reverse index, given its hash value.''' * cdef string_ptr string = STRINGS[hash_value] * if string == NULL: # <<<<<<<<<<<<<< @@ -1877,29 +2379,29 @@ static PyObject *__pyx_f_5spacy_2en_unhash(__pyx_t_5spacy_6lexeme_StringHash __p __pyx_t_4 = ((__pyx_v_string == NULL) != 0); if (__pyx_t_4) { - /* "spacy/en.pyx":88 + /* "spacy/en.pyx":130 * cdef string_ptr string = STRINGS[hash_value] * if string == NULL: * raise ValueError(hash_value) # <<<<<<<<<<<<<< * * return string */ - __pyx_t_2 = __Pyx_PyInt_From_uint64_t(__pyx_v_hash_value); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 88; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyInt_From_uint64_t(__pyx_v_hash_value); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 130; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 88; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 130; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_t_2); __Pyx_GIVEREF(__pyx_t_2); __pyx_t_2 = 0; - __pyx_t_2 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_t_1, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 88; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_t_1, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 130; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_Raise(__pyx_t_2, 0, 0, 0); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 88; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 130; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "spacy/en.pyx":90 + /* "spacy/en.pyx":132 * raise ValueError(hash_value) * * return string # <<<<<<<<<<<<<< @@ -1907,14 +2409,14 @@ static PyObject *__pyx_f_5spacy_2en_unhash(__pyx_t_5spacy_6lexeme_StringHash __p * */ __Pyx_XDECREF(__pyx_r); - __pyx_t_2 = __Pyx_PyUnicode_FromUnicode(__pyx_v_string); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 90; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyUnicode_FromUnicode(__pyx_v_string); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 132; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - if (!(likely(PyUnicode_CheckExact(__pyx_t_2))||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "unicode", Py_TYPE(__pyx_t_2)->tp_name), 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 90; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (!(likely(PyUnicode_CheckExact(__pyx_t_2))||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "unicode", Py_TYPE(__pyx_t_2)->tp_name), 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 132; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_r = ((PyObject*)__pyx_t_2); __pyx_t_2 = 0; goto __pyx_L0; - /* "spacy/en.pyx":84 + /* "spacy/en.pyx":126 * * * cpdef unicode unhash(StringHash hash_value): # <<<<<<<<<<<<<< @@ -1935,9 +2437,9 @@ static PyObject *__pyx_f_5spacy_2en_unhash(__pyx_t_5spacy_6lexeme_StringHash __p } /* Python wrapper */ -static PyObject *__pyx_pw_5spacy_2en_7unhash(PyObject *__pyx_self, PyObject *__pyx_arg_hash_value); /*proto*/ -static char __pyx_doc_5spacy_2en_6unhash[] = "Fetch a string from the reverse index, given its hash value."; -static PyObject *__pyx_pw_5spacy_2en_7unhash(PyObject *__pyx_self, PyObject *__pyx_arg_hash_value) { +static PyObject *__pyx_pw_5spacy_2en_9unhash(PyObject *__pyx_self, PyObject *__pyx_arg_hash_value); /*proto*/ +static char __pyx_doc_5spacy_2en_8unhash[] = "Fetch a string from the reverse index, given its hash value."; +static PyObject *__pyx_pw_5spacy_2en_9unhash(PyObject *__pyx_self, PyObject *__pyx_arg_hash_value) { __pyx_t_5spacy_6lexeme_StringHash __pyx_v_hash_value; int __pyx_lineno = 0; const char *__pyx_filename = NULL; @@ -1946,7 +2448,7 @@ static PyObject *__pyx_pw_5spacy_2en_7unhash(PyObject *__pyx_self, PyObject *__p __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("unhash (wrapper)", 0); assert(__pyx_arg_hash_value); { - __pyx_v_hash_value = __Pyx_PyInt_As_uint64_t(__pyx_arg_hash_value); if (unlikely((__pyx_v_hash_value == (uint64_t)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 84; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_hash_value = __Pyx_PyInt_As_uint64_t(__pyx_arg_hash_value); if (unlikely((__pyx_v_hash_value == (uint64_t)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 126; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } goto __pyx_L4_argument_unpacking_done; __pyx_L3_error:; @@ -1954,14 +2456,14 @@ static PyObject *__pyx_pw_5spacy_2en_7unhash(PyObject *__pyx_self, PyObject *__p __Pyx_RefNannyFinishContext(); return NULL; __pyx_L4_argument_unpacking_done:; - __pyx_r = __pyx_pf_5spacy_2en_6unhash(__pyx_self, ((__pyx_t_5spacy_6lexeme_StringHash)__pyx_v_hash_value)); + __pyx_r = __pyx_pf_5spacy_2en_8unhash(__pyx_self, ((__pyx_t_5spacy_6lexeme_StringHash)__pyx_v_hash_value)); /* function exit code */ __Pyx_RefNannyFinishContext(); return __pyx_r; } -static PyObject *__pyx_pf_5spacy_2en_6unhash(CYTHON_UNUSED PyObject *__pyx_self, __pyx_t_5spacy_6lexeme_StringHash __pyx_v_hash_value) { +static PyObject *__pyx_pf_5spacy_2en_8unhash(CYTHON_UNUSED PyObject *__pyx_self, __pyx_t_5spacy_6lexeme_StringHash __pyx_v_hash_value) { PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations PyObject *__pyx_t_1 = NULL; @@ -1970,7 +2472,7 @@ static PyObject *__pyx_pf_5spacy_2en_6unhash(CYTHON_UNUSED PyObject *__pyx_self, int __pyx_clineno = 0; __Pyx_RefNannySetupContext("unhash", 0); __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = __pyx_f_5spacy_2en_unhash(__pyx_v_hash_value, 0); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 84; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __pyx_f_5spacy_2en_unhash(__pyx_v_hash_value, 0); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 126; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __pyx_r = __pyx_t_1; __pyx_t_1 = 0; @@ -1987,7 +2489,7 @@ static PyObject *__pyx_pf_5spacy_2en_6unhash(CYTHON_UNUSED PyObject *__pyx_self, return __pyx_r; } -/* "spacy/en.pyx":93 +/* "spacy/en.pyx":135 * * * cdef unicode normalize_word_string(unicode word): # <<<<<<<<<<<<<< @@ -2010,26 +2512,26 @@ static PyObject *__pyx_f_5spacy_2en_normalize_word_string(PyObject *__pyx_v_word int __pyx_clineno = 0; __Pyx_RefNannySetupContext("normalize_word_string", 0); - /* "spacy/en.pyx":100 + /* "spacy/en.pyx":142 * ''' * cdef unicode s * if word.isdigit() and len(word) == 4: # <<<<<<<<<<<<<< * return '!YEAR' * elif word[0].isdigit(): */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_word, __pyx_n_s_isdigit); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 100; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_word, __pyx_n_s_isdigit); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 142; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); - __pyx_t_2 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_empty_tuple, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 100; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_empty_tuple, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 142; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_3 = __Pyx_PyObject_IsTrue(__pyx_t_2); if (unlikely(__pyx_t_3 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 100; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = __Pyx_PyObject_IsTrue(__pyx_t_2); if (unlikely(__pyx_t_3 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 142; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; if (__pyx_t_3) { if (unlikely(__pyx_v_word == Py_None)) { PyErr_SetString(PyExc_TypeError, "object of type 'NoneType' has no len()"); - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 100; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 142; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - __pyx_t_4 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_word); if (unlikely(__pyx_t_4 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 100; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_4 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_word); if (unlikely(__pyx_t_4 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 142; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_t_5 = (__pyx_t_4 == 4); __pyx_t_6 = __pyx_t_5; } else { @@ -2037,7 +2539,7 @@ static PyObject *__pyx_f_5spacy_2en_normalize_word_string(PyObject *__pyx_v_word } if (__pyx_t_6) { - /* "spacy/en.pyx":101 + /* "spacy/en.pyx":143 * cdef unicode s * if word.isdigit() and len(word) == 4: * return '!YEAR' # <<<<<<<<<<<<<< @@ -2050,18 +2552,18 @@ static PyObject *__pyx_f_5spacy_2en_normalize_word_string(PyObject *__pyx_v_word goto __pyx_L0; } - /* "spacy/en.pyx":102 + /* "spacy/en.pyx":144 * if word.isdigit() and len(word) == 4: * return '!YEAR' * elif word[0].isdigit(): # <<<<<<<<<<<<<< * return '!DIGITS' * else: */ - __pyx_t_7 = __Pyx_GetItemInt_Unicode(__pyx_v_word, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(__pyx_t_7 == (Py_UCS4)-1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 102; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + __pyx_t_7 = __Pyx_GetItemInt_Unicode(__pyx_v_word, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(__pyx_t_7 == (Py_UCS4)-1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 144; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; __pyx_t_6 = Py_UNICODE_ISDIGIT(__pyx_t_7); if ((__pyx_t_6 != 0)) { - /* "spacy/en.pyx":103 + /* "spacy/en.pyx":145 * return '!YEAR' * elif word[0].isdigit(): * return '!DIGITS' # <<<<<<<<<<<<<< @@ -2075,7 +2577,7 @@ static PyObject *__pyx_f_5spacy_2en_normalize_word_string(PyObject *__pyx_v_word } /*else*/ { - /* "spacy/en.pyx":105 + /* "spacy/en.pyx":147 * return '!DIGITS' * else: * return word.lower() # <<<<<<<<<<<<<< @@ -2083,18 +2585,18 @@ static PyObject *__pyx_f_5spacy_2en_normalize_word_string(PyObject *__pyx_v_word * */ __Pyx_XDECREF(__pyx_r); - __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_v_word, __pyx_n_s_lower); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 105; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_v_word, __pyx_n_s_lower); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 147; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_empty_tuple, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 105; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_empty_tuple, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 147; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - if (!(likely(PyUnicode_CheckExact(__pyx_t_1))||((__pyx_t_1) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "unicode", Py_TYPE(__pyx_t_1)->tp_name), 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 105; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (!(likely(PyUnicode_CheckExact(__pyx_t_1))||((__pyx_t_1) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "unicode", Py_TYPE(__pyx_t_1)->tp_name), 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 147; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_r = ((PyObject*)__pyx_t_1); __pyx_t_1 = 0; goto __pyx_L0; } - /* "spacy/en.pyx":93 + /* "spacy/en.pyx":135 * * * cdef unicode normalize_word_string(unicode word): # <<<<<<<<<<<<<< @@ -2114,7 +2616,7 @@ static PyObject *__pyx_f_5spacy_2en_normalize_word_string(PyObject *__pyx_v_word return __pyx_r; } -/* "spacy/en.pyx":108 +/* "spacy/en.pyx":150 * * * cpdef unicode _substr(unicode string, int start, int end, size_t length): # <<<<<<<<<<<<<< @@ -2122,7 +2624,7 @@ static PyObject *__pyx_f_5spacy_2en_normalize_word_string(PyObject *__pyx_v_word * end = -1 */ -static PyObject *__pyx_pw_5spacy_2en_9_substr(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static PyObject *__pyx_pw_5spacy_2en_11_substr(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ static PyObject *__pyx_f_5spacy_2en__substr(PyObject *__pyx_v_string, int __pyx_v_start, int __pyx_v_end, size_t __pyx_v_length, CYTHON_UNUSED int __pyx_skip_dispatch) { PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations @@ -2135,7 +2637,7 @@ static PyObject *__pyx_f_5spacy_2en__substr(PyObject *__pyx_v_string, int __pyx_ int __pyx_clineno = 0; __Pyx_RefNannySetupContext("_substr", 0); - /* "spacy/en.pyx":109 + /* "spacy/en.pyx":151 * * cpdef unicode _substr(unicode string, int start, int end, size_t length): * if end >= length: # <<<<<<<<<<<<<< @@ -2145,7 +2647,7 @@ static PyObject *__pyx_f_5spacy_2en__substr(PyObject *__pyx_v_string, int __pyx_ __pyx_t_1 = ((__pyx_v_end >= __pyx_v_length) != 0); if (__pyx_t_1) { - /* "spacy/en.pyx":110 + /* "spacy/en.pyx":152 * cpdef unicode _substr(unicode string, int start, int end, size_t length): * if end >= length: * end = -1 # <<<<<<<<<<<<<< @@ -2157,7 +2659,7 @@ static PyObject *__pyx_f_5spacy_2en__substr(PyObject *__pyx_v_string, int __pyx_ } __pyx_L3:; - /* "spacy/en.pyx":111 + /* "spacy/en.pyx":153 * if end >= length: * end = -1 * if start >= length: # <<<<<<<<<<<<<< @@ -2167,7 +2669,7 @@ static PyObject *__pyx_f_5spacy_2en__substr(PyObject *__pyx_v_string, int __pyx_ __pyx_t_1 = ((__pyx_v_start >= __pyx_v_length) != 0); if (__pyx_t_1) { - /* "spacy/en.pyx":112 + /* "spacy/en.pyx":154 * end = -1 * if start >= length: * start = 0 # <<<<<<<<<<<<<< @@ -2179,7 +2681,7 @@ static PyObject *__pyx_f_5spacy_2en__substr(PyObject *__pyx_v_string, int __pyx_ } __pyx_L4:; - /* "spacy/en.pyx":113 + /* "spacy/en.pyx":155 * if start >= length: * start = 0 * if start <= 0 and end < 0: # <<<<<<<<<<<<<< @@ -2195,7 +2697,7 @@ static PyObject *__pyx_f_5spacy_2en__substr(PyObject *__pyx_v_string, int __pyx_ } if (__pyx_t_3) { - /* "spacy/en.pyx":114 + /* "spacy/en.pyx":156 * start = 0 * if start <= 0 and end < 0: * return string # <<<<<<<<<<<<<< @@ -2208,7 +2710,7 @@ static PyObject *__pyx_f_5spacy_2en__substr(PyObject *__pyx_v_string, int __pyx_ goto __pyx_L0; } - /* "spacy/en.pyx":115 + /* "spacy/en.pyx":157 * if start <= 0 and end < 0: * return string * elif start < 0: # <<<<<<<<<<<<<< @@ -2218,7 +2720,7 @@ static PyObject *__pyx_f_5spacy_2en__substr(PyObject *__pyx_v_string, int __pyx_ __pyx_t_3 = ((__pyx_v_start < 0) != 0); if (__pyx_t_3) { - /* "spacy/en.pyx":116 + /* "spacy/en.pyx":158 * return string * elif start < 0: * start = 0 # <<<<<<<<<<<<<< @@ -2229,7 +2731,7 @@ static PyObject *__pyx_f_5spacy_2en__substr(PyObject *__pyx_v_string, int __pyx_ goto __pyx_L5; } - /* "spacy/en.pyx":117 + /* "spacy/en.pyx":159 * elif start < 0: * start = 0 * elif end < 0: # <<<<<<<<<<<<<< @@ -2239,7 +2741,7 @@ static PyObject *__pyx_f_5spacy_2en__substr(PyObject *__pyx_v_string, int __pyx_ __pyx_t_3 = ((__pyx_v_end < 0) != 0); if (__pyx_t_3) { - /* "spacy/en.pyx":118 + /* "spacy/en.pyx":160 * start = 0 * elif end < 0: * end = length # <<<<<<<<<<<<<< @@ -2251,7 +2753,7 @@ static PyObject *__pyx_f_5spacy_2en__substr(PyObject *__pyx_v_string, int __pyx_ } __pyx_L5:; - /* "spacy/en.pyx":119 + /* "spacy/en.pyx":161 * elif end < 0: * end = length * return string[start:end] # <<<<<<<<<<<<<< @@ -2261,15 +2763,15 @@ static PyObject *__pyx_f_5spacy_2en__substr(PyObject *__pyx_v_string, int __pyx_ __Pyx_XDECREF(__pyx_r); if (unlikely(__pyx_v_string == Py_None)) { PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 119; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 161; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - __pyx_t_4 = __Pyx_PyUnicode_Substring(__pyx_v_string, __pyx_v_start, __pyx_v_end); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 119; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_4 = __Pyx_PyUnicode_Substring(__pyx_v_string, __pyx_v_start, __pyx_v_end); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 161; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_4); __pyx_r = ((PyObject*)__pyx_t_4); __pyx_t_4 = 0; goto __pyx_L0; - /* "spacy/en.pyx":108 + /* "spacy/en.pyx":150 * * * cpdef unicode _substr(unicode string, int start, int end, size_t length): # <<<<<<<<<<<<<< @@ -2289,8 +2791,8 @@ static PyObject *__pyx_f_5spacy_2en__substr(PyObject *__pyx_v_string, int __pyx_ } /* Python wrapper */ -static PyObject *__pyx_pw_5spacy_2en_9_substr(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ -static PyObject *__pyx_pw_5spacy_2en_9_substr(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { +static PyObject *__pyx_pw_5spacy_2en_11_substr(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static PyObject *__pyx_pw_5spacy_2en_11_substr(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { PyObject *__pyx_v_string = 0; int __pyx_v_start; int __pyx_v_end; @@ -2323,21 +2825,21 @@ static PyObject *__pyx_pw_5spacy_2en_9_substr(PyObject *__pyx_self, PyObject *__ case 1: if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_start)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("_substr", 1, 4, 4, 1); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 108; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("_substr", 1, 4, 4, 1); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 150; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } case 2: if (likely((values[2] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_end)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("_substr", 1, 4, 4, 2); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 108; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("_substr", 1, 4, 4, 2); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 150; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } case 3: if (likely((values[3] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_length)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("_substr", 1, 4, 4, 3); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 108; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("_substr", 1, 4, 4, 3); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 150; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } if (unlikely(kw_args > 0)) { - if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "_substr") < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 108; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "_substr") < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 150; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } else if (PyTuple_GET_SIZE(__pyx_args) != 4) { goto __pyx_L5_argtuple_error; @@ -2348,20 +2850,20 @@ static PyObject *__pyx_pw_5spacy_2en_9_substr(PyObject *__pyx_self, PyObject *__ values[3] = PyTuple_GET_ITEM(__pyx_args, 3); } __pyx_v_string = ((PyObject*)values[0]); - __pyx_v_start = __Pyx_PyInt_As_int(values[1]); if (unlikely((__pyx_v_start == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 108; __pyx_clineno = __LINE__; goto __pyx_L3_error;} - __pyx_v_end = __Pyx_PyInt_As_int(values[2]); if (unlikely((__pyx_v_end == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 108; __pyx_clineno = __LINE__; goto __pyx_L3_error;} - __pyx_v_length = __Pyx_PyInt_As_size_t(values[3]); if (unlikely((__pyx_v_length == (size_t)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 108; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_start = __Pyx_PyInt_As_int(values[1]); if (unlikely((__pyx_v_start == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 150; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_end = __Pyx_PyInt_As_int(values[2]); if (unlikely((__pyx_v_end == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 150; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_length = __Pyx_PyInt_As_size_t(values[3]); if (unlikely((__pyx_v_length == (size_t)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 150; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; - __Pyx_RaiseArgtupleInvalid("_substr", 1, 4, 4, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 108; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("_substr", 1, 4, 4, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 150; __pyx_clineno = __LINE__; goto __pyx_L3_error;} __pyx_L3_error:; __Pyx_AddTraceback("spacy.en._substr", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); return NULL; __pyx_L4_argument_unpacking_done:; - if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_string), (&PyUnicode_Type), 1, "string", 1))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 108; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_r = __pyx_pf_5spacy_2en_8_substr(__pyx_self, __pyx_v_string, __pyx_v_start, __pyx_v_end, __pyx_v_length); + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_string), (&PyUnicode_Type), 1, "string", 1))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 150; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_r = __pyx_pf_5spacy_2en_10_substr(__pyx_self, __pyx_v_string, __pyx_v_start, __pyx_v_end, __pyx_v_length); /* function exit code */ goto __pyx_L0; @@ -2372,7 +2874,7 @@ static PyObject *__pyx_pw_5spacy_2en_9_substr(PyObject *__pyx_self, PyObject *__ return __pyx_r; } -static PyObject *__pyx_pf_5spacy_2en_8_substr(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_string, int __pyx_v_start, int __pyx_v_end, size_t __pyx_v_length) { +static PyObject *__pyx_pf_5spacy_2en_10_substr(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_string, int __pyx_v_start, int __pyx_v_end, size_t __pyx_v_length) { PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations PyObject *__pyx_t_1 = NULL; @@ -2381,7 +2883,7 @@ static PyObject *__pyx_pf_5spacy_2en_8_substr(CYTHON_UNUSED PyObject *__pyx_self int __pyx_clineno = 0; __Pyx_RefNannySetupContext("_substr", 0); __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = __pyx_f_5spacy_2en__substr(__pyx_v_string, __pyx_v_start, __pyx_v_end, __pyx_v_length, 0); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 108; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __pyx_f_5spacy_2en__substr(__pyx_v_string, __pyx_v_start, __pyx_v_end, __pyx_v_length, 0); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 150; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __pyx_r = __pyx_t_1; __pyx_t_1 = 0; @@ -2398,7 +2900,7 @@ static PyObject *__pyx_pf_5spacy_2en_8_substr(CYTHON_UNUSED PyObject *__pyx_self return __pyx_r; } -/* "spacy/en.pyx":122 +/* "spacy/en.pyx":164 * * * cdef Lexeme* _add(StringHash hashed, unicode string, int split, size_t length) except NULL: # <<<<<<<<<<<<<< @@ -2418,7 +2920,7 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__add(__pyx_t_5sp int __pyx_clineno = 0; __Pyx_RefNannySetupContext("_add", 0); - /* "spacy/en.pyx":123 + /* "spacy/en.pyx":165 * * cdef Lexeme* _add(StringHash hashed, unicode string, int split, size_t length) except NULL: * assert string # <<<<<<<<<<<<<< @@ -2430,12 +2932,12 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__add(__pyx_t_5sp __pyx_t_1 = (__pyx_v_string != Py_None) && (PyUnicode_GET_SIZE(__pyx_v_string) != 0); if (unlikely(!__pyx_t_1)) { PyErr_SetNone(PyExc_AssertionError); - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 123; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 165; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } } #endif - /* "spacy/en.pyx":124 + /* "spacy/en.pyx":166 * cdef Lexeme* _add(StringHash hashed, unicode string, int split, size_t length) except NULL: * assert string * assert split <= length # <<<<<<<<<<<<<< @@ -2446,22 +2948,22 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__add(__pyx_t_5sp if (unlikely(!Py_OptimizeFlag)) { if (unlikely(!((__pyx_v_split <= __pyx_v_length) != 0))) { PyErr_SetNone(PyExc_AssertionError); - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 124; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 166; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } } #endif - /* "spacy/en.pyx":125 + /* "spacy/en.pyx":167 * assert string * assert split <= length * word = _init_lexeme(string, hashed, split, length) # <<<<<<<<<<<<<< * LEXEMES[hashed] = word * STRINGS[hashed] = string */ - __pyx_t_2 = __pyx_f_5spacy_2en__init_lexeme(__pyx_v_string, __pyx_v_hashed, __pyx_v_split, __pyx_v_length); if (unlikely(__pyx_t_2 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 125; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __pyx_f_5spacy_2en__init_lexeme(__pyx_v_string, __pyx_v_hashed, __pyx_v_split, __pyx_v_length); if (unlikely(__pyx_t_2 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 167; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_word = __pyx_t_2; - /* "spacy/en.pyx":126 + /* "spacy/en.pyx":168 * assert split <= length * word = _init_lexeme(string, hashed, split, length) * LEXEMES[hashed] = word # <<<<<<<<<<<<<< @@ -2470,19 +2972,19 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__add(__pyx_t_5sp */ (__pyx_v_5spacy_2en_LEXEMES[__pyx_v_hashed]) = __pyx_v_word; - /* "spacy/en.pyx":127 + /* "spacy/en.pyx":169 * word = _init_lexeme(string, hashed, split, length) * LEXEMES[hashed] = word * STRINGS[hashed] = string # <<<<<<<<<<<<<< * return word * */ - __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_STRINGS); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 127; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_STRINGS); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 169; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_3); - if (unlikely(__Pyx_SetItemInt(__pyx_t_3, __pyx_v_hashed, __pyx_v_string, __pyx_t_5spacy_6lexeme_StringHash, 0, __Pyx_PyInt_From_uint64_t, 0, 0, 1) < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 127; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(__Pyx_SetItemInt(__pyx_t_3, __pyx_v_hashed, __pyx_v_string, __pyx_t_5spacy_6lexeme_StringHash, 0, __Pyx_PyInt_From_uint64_t, 0, 0, 1) < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 169; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - /* "spacy/en.pyx":128 + /* "spacy/en.pyx":170 * LEXEMES[hashed] = word * STRINGS[hashed] = string * return word # <<<<<<<<<<<<<< @@ -2492,7 +2994,7 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__add(__pyx_t_5sp __pyx_r = __pyx_v_word; goto __pyx_L0; - /* "spacy/en.pyx":122 + /* "spacy/en.pyx":164 * * * cdef Lexeme* _add(StringHash hashed, unicode string, int split, size_t length) except NULL: # <<<<<<<<<<<<<< @@ -2510,7 +3012,7 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__add(__pyx_t_5sp return __pyx_r; } -/* "spacy/en.pyx":131 +/* "spacy/en.pyx":173 * * * cdef Lexeme* _init_lexeme(unicode string, StringHash hashed, # <<<<<<<<<<<<<< @@ -2540,7 +3042,7 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO int __pyx_clineno = 0; __Pyx_RefNannySetupContext("_init_lexeme", 0); - /* "spacy/en.pyx":133 + /* "spacy/en.pyx":175 * cdef Lexeme* _init_lexeme(unicode string, StringHash hashed, * int split, size_t length) except NULL: * assert split <= length # <<<<<<<<<<<<<< @@ -2551,12 +3053,12 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO if (unlikely(!Py_OptimizeFlag)) { if (unlikely(!((__pyx_v_split <= __pyx_v_length) != 0))) { PyErr_SetNone(PyExc_AssertionError); - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 133; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 175; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } } #endif - /* "spacy/en.pyx":134 + /* "spacy/en.pyx":176 * int split, size_t length) except NULL: * assert split <= length * cdef Lexeme* word = calloc(1, sizeof(Lexeme)) # <<<<<<<<<<<<<< @@ -2565,7 +3067,7 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO */ __pyx_v_word = ((struct __pyx_t_5spacy_6lexeme_Lexeme *)calloc(1, (sizeof(struct __pyx_t_5spacy_6lexeme_Lexeme)))); - /* "spacy/en.pyx":136 + /* "spacy/en.pyx":178 * cdef Lexeme* word = calloc(1, sizeof(Lexeme)) * * word.first = (string[0] if string else 0) # <<<<<<<<<<<<<< @@ -2574,14 +3076,14 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO */ __pyx_t_2 = (__pyx_v_string != Py_None) && (PyUnicode_GET_SIZE(__pyx_v_string) != 0); if (__pyx_t_2) { - __pyx_t_3 = __Pyx_GetItemInt_Unicode(__pyx_v_string, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(__pyx_t_3 == (Py_UCS4)-1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 136; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + __pyx_t_3 = __Pyx_GetItemInt_Unicode(__pyx_v_string, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(__pyx_t_3 == (Py_UCS4)-1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 178; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; __pyx_t_1 = __pyx_t_3; } else { __pyx_t_1 = 0; } __pyx_v_word->first = ((Py_UNICODE)__pyx_t_1); - /* "spacy/en.pyx":137 + /* "spacy/en.pyx":179 * * word.first = (string[0] if string else 0) * word.sic = hashed # <<<<<<<<<<<<<< @@ -2590,7 +3092,7 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO */ __pyx_v_word->sic = __pyx_v_hashed; - /* "spacy/en.pyx":141 + /* "spacy/en.pyx":183 * cdef unicode tail_string * cdef unicode lex * if split != 0 and split < length: # <<<<<<<<<<<<<< @@ -2606,26 +3108,26 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO } if (__pyx_t_5) { - /* "spacy/en.pyx":142 + /* "spacy/en.pyx":184 * cdef unicode lex * if split != 0 and split < length: * lex = _substr(string, 0, split, length) # <<<<<<<<<<<<<< * tail_string = _substr(string, split, length, length) * else: */ - __pyx_t_6 = __pyx_f_5spacy_2en__substr(__pyx_v_string, 0, __pyx_v_split, __pyx_v_length, 0); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 142; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_6 = __pyx_f_5spacy_2en__substr(__pyx_v_string, 0, __pyx_v_split, __pyx_v_length, 0); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 184; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_6); __pyx_v_lex = ((PyObject*)__pyx_t_6); __pyx_t_6 = 0; - /* "spacy/en.pyx":143 + /* "spacy/en.pyx":185 * if split != 0 and split < length: * lex = _substr(string, 0, split, length) * tail_string = _substr(string, split, length, length) # <<<<<<<<<<<<<< * else: * lex = string */ - __pyx_t_6 = __pyx_f_5spacy_2en__substr(__pyx_v_string, __pyx_v_split, __pyx_v_length, __pyx_v_length, 0); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 143; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_6 = __pyx_f_5spacy_2en__substr(__pyx_v_string, __pyx_v_split, __pyx_v_length, __pyx_v_length, 0); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 185; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_6); __pyx_v_tail_string = ((PyObject*)__pyx_t_6); __pyx_t_6 = 0; @@ -2633,7 +3135,7 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO } /*else*/ { - /* "spacy/en.pyx":145 + /* "spacy/en.pyx":187 * tail_string = _substr(string, split, length, length) * else: * lex = string # <<<<<<<<<<<<<< @@ -2643,7 +3145,7 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO __Pyx_INCREF(__pyx_v_string); __pyx_v_lex = __pyx_v_string; - /* "spacy/en.pyx":146 + /* "spacy/en.pyx":188 * else: * lex = string * tail_string = '' # <<<<<<<<<<<<<< @@ -2655,7 +3157,7 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO } __pyx_L3:; - /* "spacy/en.pyx":147 + /* "spacy/en.pyx":189 * lex = string * tail_string = '' * assert lex # <<<<<<<<<<<<<< @@ -2667,36 +3169,36 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO __pyx_t_5 = (__pyx_v_lex != Py_None) && (PyUnicode_GET_SIZE(__pyx_v_lex) != 0); if (unlikely(!__pyx_t_5)) { PyErr_SetNone(PyExc_AssertionError); - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 147; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 189; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } } #endif - /* "spacy/en.pyx":148 + /* "spacy/en.pyx":190 * tail_string = '' * assert lex * cdef unicode normed = normalize_word_string(lex) # <<<<<<<<<<<<<< * cdef unicode last3 = _substr(string, length - 3, length, length) * */ - __pyx_t_6 = __pyx_f_5spacy_2en_normalize_word_string(__pyx_v_lex); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 148; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_6 = __pyx_f_5spacy_2en_normalize_word_string(__pyx_v_lex); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 190; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_6); __pyx_v_normed = ((PyObject*)__pyx_t_6); __pyx_t_6 = 0; - /* "spacy/en.pyx":149 + /* "spacy/en.pyx":191 * assert lex * cdef unicode normed = normalize_word_string(lex) * cdef unicode last3 = _substr(string, length - 3, length, length) # <<<<<<<<<<<<<< * * assert normed */ - __pyx_t_6 = __pyx_f_5spacy_2en__substr(__pyx_v_string, (__pyx_v_length - 3), __pyx_v_length, __pyx_v_length, 0); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 149; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_6 = __pyx_f_5spacy_2en__substr(__pyx_v_string, (__pyx_v_length - 3), __pyx_v_length, __pyx_v_length, 0); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 191; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_6); __pyx_v_last3 = ((PyObject*)__pyx_t_6); __pyx_t_6 = 0; - /* "spacy/en.pyx":151 + /* "spacy/en.pyx":193 * cdef unicode last3 = _substr(string, length - 3, length, length) * * assert normed # <<<<<<<<<<<<<< @@ -2708,12 +3210,12 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO __pyx_t_5 = (__pyx_v_normed != Py_None) && (PyUnicode_GET_SIZE(__pyx_v_normed) != 0); if (unlikely(!__pyx_t_5)) { PyErr_SetNone(PyExc_AssertionError); - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 151; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 193; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } } #endif - /* "spacy/en.pyx":152 + /* "spacy/en.pyx":194 * * assert normed * assert len(normed) # <<<<<<<<<<<<<< @@ -2724,17 +3226,17 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO if (unlikely(!Py_OptimizeFlag)) { if (unlikely(__pyx_v_normed == Py_None)) { PyErr_SetString(PyExc_TypeError, "object of type 'NoneType' has no len()"); - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 152; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 194; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - __pyx_t_7 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_normed); if (unlikely(__pyx_t_7 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 152; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_7 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_normed); if (unlikely(__pyx_t_7 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 194; __pyx_clineno = __LINE__; goto __pyx_L1_error;} if (unlikely(!(__pyx_t_7 != 0))) { PyErr_SetNone(PyExc_AssertionError); - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 152; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 194; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } } #endif - /* "spacy/en.pyx":154 + /* "spacy/en.pyx":196 * assert len(normed) * * word.lex = hash_string(lex, len(lex)) # <<<<<<<<<<<<<< @@ -2743,13 +3245,13 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO */ if (unlikely(__pyx_v_lex == Py_None)) { PyErr_SetString(PyExc_TypeError, "object of type 'NoneType' has no len()"); - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 154; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 196; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - __pyx_t_7 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_lex); if (unlikely(__pyx_t_7 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 154; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_t_8 = __pyx_f_5spacy_2en_hash_string(__pyx_v_lex, __pyx_t_7); if (unlikely(__pyx_t_8 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 154; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_7 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_lex); if (unlikely(__pyx_t_7 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 196; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_8 = __pyx_f_5spacy_2en_hash_string(__pyx_v_lex, __pyx_t_7); if (unlikely(__pyx_t_8 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 196; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_word->lex = __pyx_t_8; - /* "spacy/en.pyx":155 + /* "spacy/en.pyx":197 * * word.lex = hash_string(lex, len(lex)) * word.normed = hash_string(normed, len(normed)) # <<<<<<<<<<<<<< @@ -2758,13 +3260,13 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO */ if (unlikely(__pyx_v_normed == Py_None)) { PyErr_SetString(PyExc_TypeError, "object of type 'NoneType' has no len()"); - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 155; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 197; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - __pyx_t_7 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_normed); if (unlikely(__pyx_t_7 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 155; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_t_8 = __pyx_f_5spacy_2en_hash_string(__pyx_v_normed, __pyx_t_7); if (unlikely(__pyx_t_8 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 155; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_7 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_normed); if (unlikely(__pyx_t_7 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 197; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_8 = __pyx_f_5spacy_2en_hash_string(__pyx_v_normed, __pyx_t_7); if (unlikely(__pyx_t_8 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 197; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_word->normed = __pyx_t_8; - /* "spacy/en.pyx":156 + /* "spacy/en.pyx":198 * word.lex = hash_string(lex, len(lex)) * word.normed = hash_string(normed, len(normed)) * word.last3 = hash_string(last3, len(last3)) # <<<<<<<<<<<<<< @@ -2773,49 +3275,49 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO */ if (unlikely(__pyx_v_last3 == Py_None)) { PyErr_SetString(PyExc_TypeError, "object of type 'NoneType' has no len()"); - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 156; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 198; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - __pyx_t_7 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_last3); if (unlikely(__pyx_t_7 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 156; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_t_8 = __pyx_f_5spacy_2en_hash_string(__pyx_v_last3, __pyx_t_7); if (unlikely(__pyx_t_8 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 156; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_7 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_last3); if (unlikely(__pyx_t_7 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 198; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_8 = __pyx_f_5spacy_2en_hash_string(__pyx_v_last3, __pyx_t_7); if (unlikely(__pyx_t_8 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 198; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_word->last3 = __pyx_t_8; - /* "spacy/en.pyx":158 + /* "spacy/en.pyx":200 * word.last3 = hash_string(last3, len(last3)) * * STRINGS[word.lex] = lex # <<<<<<<<<<<<<< * STRINGS[word.normed] = normed * STRINGS[word.last3] = last3 */ - __pyx_t_6 = __Pyx_GetModuleGlobalName(__pyx_n_s_STRINGS); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 158; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_6 = __Pyx_GetModuleGlobalName(__pyx_n_s_STRINGS); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 200; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_6); - if (unlikely(__Pyx_SetItemInt(__pyx_t_6, __pyx_v_word->lex, __pyx_v_lex, __pyx_t_5spacy_6lexeme_StringHash, 0, __Pyx_PyInt_From_uint64_t, 0, 0, 1) < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 158; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(__Pyx_SetItemInt(__pyx_t_6, __pyx_v_word->lex, __pyx_v_lex, __pyx_t_5spacy_6lexeme_StringHash, 0, __Pyx_PyInt_From_uint64_t, 0, 0, 1) < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 200; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; - /* "spacy/en.pyx":159 + /* "spacy/en.pyx":201 * * STRINGS[word.lex] = lex * STRINGS[word.normed] = normed # <<<<<<<<<<<<<< * STRINGS[word.last3] = last3 * */ - __pyx_t_6 = __Pyx_GetModuleGlobalName(__pyx_n_s_STRINGS); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 159; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_6 = __Pyx_GetModuleGlobalName(__pyx_n_s_STRINGS); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 201; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_6); - if (unlikely(__Pyx_SetItemInt(__pyx_t_6, __pyx_v_word->normed, __pyx_v_normed, __pyx_t_5spacy_6lexeme_StringHash, 0, __Pyx_PyInt_From_uint64_t, 0, 0, 1) < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 159; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(__Pyx_SetItemInt(__pyx_t_6, __pyx_v_word->normed, __pyx_v_normed, __pyx_t_5spacy_6lexeme_StringHash, 0, __Pyx_PyInt_From_uint64_t, 0, 0, 1) < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 201; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; - /* "spacy/en.pyx":160 + /* "spacy/en.pyx":202 * STRINGS[word.lex] = lex * STRINGS[word.normed] = normed * STRINGS[word.last3] = last3 # <<<<<<<<<<<<<< * * # These are loaded later */ - __pyx_t_6 = __Pyx_GetModuleGlobalName(__pyx_n_s_STRINGS); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 160; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_6 = __Pyx_GetModuleGlobalName(__pyx_n_s_STRINGS); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 202; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_6); - if (unlikely(__Pyx_SetItemInt(__pyx_t_6, __pyx_v_word->last3, __pyx_v_last3, __pyx_t_5spacy_6lexeme_StringHash, 0, __Pyx_PyInt_From_uint64_t, 0, 0, 1) < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 160; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(__Pyx_SetItemInt(__pyx_t_6, __pyx_v_word->last3, __pyx_v_last3, __pyx_t_5spacy_6lexeme_StringHash, 0, __Pyx_PyInt_From_uint64_t, 0, 0, 1) < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 202; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; - /* "spacy/en.pyx":163 + /* "spacy/en.pyx":205 * * # These are loaded later * word.prob = 0 # <<<<<<<<<<<<<< @@ -2824,7 +3326,7 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO */ __pyx_v_word->prob = 0.0; - /* "spacy/en.pyx":164 + /* "spacy/en.pyx":206 * # These are loaded later * word.prob = 0 * word.cluster = 0 # <<<<<<<<<<<<<< @@ -2833,7 +3335,7 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO */ __pyx_v_word->cluster = 0; - /* "spacy/en.pyx":165 + /* "spacy/en.pyx":207 * word.prob = 0 * word.cluster = 0 * word.oft_upper = False # <<<<<<<<<<<<<< @@ -2842,7 +3344,7 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO */ __pyx_v_word->oft_upper = 0; - /* "spacy/en.pyx":166 + /* "spacy/en.pyx":208 * word.cluster = 0 * word.oft_upper = False * word.oft_title = False # <<<<<<<<<<<<<< @@ -2851,7 +3353,7 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO */ __pyx_v_word->oft_title = 0; - /* "spacy/en.pyx":169 + /* "spacy/en.pyx":211 * * # Now recurse, and deal with the tail * if tail_string: # <<<<<<<<<<<<<< @@ -2861,20 +3363,20 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO __pyx_t_5 = (__pyx_v_tail_string != Py_None) && (PyUnicode_GET_SIZE(__pyx_v_tail_string) != 0); if (__pyx_t_5) { - /* "spacy/en.pyx":170 + /* "spacy/en.pyx":212 * # Now recurse, and deal with the tail * if tail_string: * word.tail = lookup(tail_string) # <<<<<<<<<<<<<< * return word * */ - __pyx_t_9 = __pyx_f_5spacy_2en_lookup(__pyx_v_tail_string, 0); if (unlikely(__pyx_t_9 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 170; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_9 = __pyx_f_5spacy_2en_lookup(__pyx_v_tail_string, 0); if (unlikely(__pyx_t_9 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 212; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_word->tail = ((struct __pyx_t_5spacy_6lexeme_Lexeme *)__pyx_t_9); goto __pyx_L4; } __pyx_L4:; - /* "spacy/en.pyx":171 + /* "spacy/en.pyx":213 * if tail_string: * word.tail = lookup(tail_string) * return word # <<<<<<<<<<<<<< @@ -2884,7 +3386,7 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO __pyx_r = __pyx_v_word; goto __pyx_L0; - /* "spacy/en.pyx":131 + /* "spacy/en.pyx":173 * * * cdef Lexeme* _init_lexeme(unicode string, StringHash hashed, # <<<<<<<<<<<<<< @@ -2906,7 +3408,7 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO return __pyx_r; } -/* "spacy/en.pyx":174 +/* "spacy/en.pyx":216 * * * cdef size_t _find_split(unicode word, size_t length): # <<<<<<<<<<<<<< @@ -2926,7 +3428,7 @@ static size_t __pyx_f_5spacy_2en__find_split(PyObject *__pyx_v_word, size_t __py int __pyx_clineno = 0; __Pyx_RefNannySetupContext("_find_split", 0); - /* "spacy/en.pyx":175 + /* "spacy/en.pyx":217 * * cdef size_t _find_split(unicode word, size_t length): * cdef int i = 0 # <<<<<<<<<<<<<< @@ -2935,7 +3437,7 @@ static size_t __pyx_f_5spacy_2en__find_split(PyObject *__pyx_v_word, size_t __py */ __pyx_v_i = 0; - /* "spacy/en.pyx":177 + /* "spacy/en.pyx":219 * cdef int i = 0 * # Contractions * if word.endswith("'s"): # <<<<<<<<<<<<<< @@ -2944,12 +3446,12 @@ static size_t __pyx_f_5spacy_2en__find_split(PyObject *__pyx_v_word, size_t __py */ if (unlikely(__pyx_v_word == Py_None)) { PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "endswith"); - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 177; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 219; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - __pyx_t_1 = __Pyx_PyUnicode_Tailmatch(__pyx_v_word, __pyx_kp_u_s, 0, PY_SSIZE_T_MAX, 1); if (unlikely(__pyx_t_1 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 177; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyUnicode_Tailmatch(__pyx_v_word, __pyx_kp_u_s, 0, PY_SSIZE_T_MAX, 1); if (unlikely(__pyx_t_1 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 219; __pyx_clineno = __LINE__; goto __pyx_L1_error;} if ((__pyx_t_1 != 0)) { - /* "spacy/en.pyx":178 + /* "spacy/en.pyx":220 * # Contractions * if word.endswith("'s"): * return length - 2 # <<<<<<<<<<<<<< @@ -2960,85 +3462,79 @@ static size_t __pyx_f_5spacy_2en__find_split(PyObject *__pyx_v_word, size_t __py goto __pyx_L0; } - /* "spacy/en.pyx":180 + /* "spacy/en.pyx":222 * return length - 2 * # Leading punctuation * if is_punct(word, 0, length): # <<<<<<<<<<<<<< * return 1 - * elif length >= 1 and is_punct(word, length - 1, length): + * elif length >= 1: */ __pyx_t_1 = (__pyx_f_5spacy_2en_is_punct(__pyx_v_word, 0, __pyx_v_length) != 0); if (__pyx_t_1) { - /* "spacy/en.pyx":181 + /* "spacy/en.pyx":223 * # Leading punctuation * if is_punct(word, 0, length): * return 1 # <<<<<<<<<<<<<< - * elif length >= 1 and is_punct(word, length - 1, length): + * elif length >= 1: * # Split off all trailing punctuation characters */ __pyx_r = 1; goto __pyx_L0; } - /* "spacy/en.pyx":182 + /* "spacy/en.pyx":224 * if is_punct(word, 0, length): * return 1 - * elif length >= 1 and is_punct(word, length - 1, length): # <<<<<<<<<<<<<< + * elif length >= 1: # <<<<<<<<<<<<<< * # Split off all trailing punctuation characters - * i = length - 1 + * i = 0 */ __pyx_t_1 = ((__pyx_v_length >= 1) != 0); if (__pyx_t_1) { - __pyx_t_2 = (__pyx_f_5spacy_2en_is_punct(__pyx_v_word, (__pyx_v_length - 1), __pyx_v_length) != 0); - __pyx_t_3 = __pyx_t_2; - } else { - __pyx_t_3 = __pyx_t_1; - } - if (__pyx_t_3) { - /* "spacy/en.pyx":184 - * elif length >= 1 and is_punct(word, length - 1, length): + /* "spacy/en.pyx":226 + * elif length >= 1: * # Split off all trailing punctuation characters - * i = length - 1 # <<<<<<<<<<<<<< - * while i >= 2 and is_punct(word, i-1, length): - * i -= 1 + * i = 0 # <<<<<<<<<<<<<< + * while i < length and not is_punct(word, i, length): + * i += 1 */ - __pyx_v_i = (__pyx_v_length - 1); + __pyx_v_i = 0; - /* "spacy/en.pyx":185 + /* "spacy/en.pyx":227 * # Split off all trailing punctuation characters - * i = length - 1 - * while i >= 2 and is_punct(word, i-1, length): # <<<<<<<<<<<<<< - * i -= 1 + * i = 0 + * while i < length and not is_punct(word, i, length): # <<<<<<<<<<<<<< + * i += 1 * return i */ while (1) { - __pyx_t_3 = ((__pyx_v_i >= 2) != 0); - if (__pyx_t_3) { - __pyx_t_1 = (__pyx_f_5spacy_2en_is_punct(__pyx_v_word, (__pyx_v_i - 1), __pyx_v_length) != 0); - __pyx_t_2 = __pyx_t_1; + __pyx_t_1 = ((__pyx_v_i < __pyx_v_length) != 0); + if (__pyx_t_1) { + __pyx_t_2 = ((!(__pyx_f_5spacy_2en_is_punct(__pyx_v_word, __pyx_v_i, __pyx_v_length) != 0)) != 0); + __pyx_t_3 = __pyx_t_2; } else { - __pyx_t_2 = __pyx_t_3; + __pyx_t_3 = __pyx_t_1; } - if (!__pyx_t_2) break; + if (!__pyx_t_3) break; - /* "spacy/en.pyx":186 - * i = length - 1 - * while i >= 2 and is_punct(word, i-1, length): - * i -= 1 # <<<<<<<<<<<<<< + /* "spacy/en.pyx":228 + * i = 0 + * while i < length and not is_punct(word, i, length): + * i += 1 # <<<<<<<<<<<<<< * return i * */ - __pyx_v_i = (__pyx_v_i - 1); + __pyx_v_i = (__pyx_v_i + 1); } goto __pyx_L4; } __pyx_L4:; - /* "spacy/en.pyx":187 - * while i >= 2 and is_punct(word, i-1, length): - * i -= 1 + /* "spacy/en.pyx":229 + * while i < length and not is_punct(word, i, length): + * i += 1 * return i # <<<<<<<<<<<<<< * * @@ -3046,7 +3542,7 @@ static size_t __pyx_f_5spacy_2en__find_split(PyObject *__pyx_v_word, size_t __py __pyx_r = __pyx_v_i; goto __pyx_L0; - /* "spacy/en.pyx":174 + /* "spacy/en.pyx":216 * * * cdef size_t _find_split(unicode word, size_t length): # <<<<<<<<<<<<<< @@ -3063,38 +3559,151 @@ static size_t __pyx_f_5spacy_2en__find_split(PyObject *__pyx_v_word, size_t __py return __pyx_r; } -/* "spacy/en.pyx":190 +/* "spacy/en.pyx":232 * * * cdef bint is_punct(unicode word, size_t i, size_t length): # <<<<<<<<<<<<<< - * return not word[i].isalnum() + * # Don't count appostrophes as punct if the next char is a letter + * if word[i] == "'" and i < (length - 1) and word[i+1].isalpha(): */ -static int __pyx_f_5spacy_2en_is_punct(PyObject *__pyx_v_word, size_t __pyx_v_i, CYTHON_UNUSED size_t __pyx_v_length) { +static int __pyx_f_5spacy_2en_is_punct(PyObject *__pyx_v_word, size_t __pyx_v_i, size_t __pyx_v_length) { int __pyx_r; __Pyx_RefNannyDeclarations Py_UCS4 __pyx_t_1; int __pyx_t_2; + int __pyx_t_3; + size_t __pyx_t_4; + int __pyx_t_5; + int __pyx_t_6; int __pyx_lineno = 0; const char *__pyx_filename = NULL; int __pyx_clineno = 0; __Pyx_RefNannySetupContext("is_punct", 0); - /* "spacy/en.pyx":191 - * + /* "spacy/en.pyx":234 * cdef bint is_punct(unicode word, size_t i, size_t length): + * # Don't count appostrophes as punct if the next char is a letter + * if word[i] == "'" and i < (length - 1) and word[i+1].isalpha(): # <<<<<<<<<<<<<< + * return False + * # Don't count commas as punct if the next char is a number + */ + __pyx_t_1 = __Pyx_GetItemInt_Unicode(__pyx_v_word, __pyx_v_i, size_t, 0, __Pyx_PyInt_FromSize_t, 0, 0, 1); if (unlikely(__pyx_t_1 == (Py_UCS4)-1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 234; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + __pyx_t_2 = (__pyx_t_1 == 39); + if (__pyx_t_2) { + __pyx_t_3 = (__pyx_v_i < (__pyx_v_length - 1)); + if (__pyx_t_3) { + __pyx_t_4 = (__pyx_v_i + 1); + __pyx_t_1 = __Pyx_GetItemInt_Unicode(__pyx_v_word, __pyx_t_4, size_t, 0, __Pyx_PyInt_FromSize_t, 0, 0, 1); if (unlikely(__pyx_t_1 == (Py_UCS4)-1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 234; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + __pyx_t_5 = Py_UNICODE_ISALPHA(__pyx_t_1); + __pyx_t_6 = (__pyx_t_5 != 0); + } else { + __pyx_t_6 = __pyx_t_3; + } + __pyx_t_3 = __pyx_t_6; + } else { + __pyx_t_3 = __pyx_t_2; + } + if (__pyx_t_3) { + + /* "spacy/en.pyx":235 + * # Don't count appostrophes as punct if the next char is a letter + * if word[i] == "'" and i < (length - 1) and word[i+1].isalpha(): + * return False # <<<<<<<<<<<<<< + * # Don't count commas as punct if the next char is a number + * if word[i] == "," and i < (length - 1) and word[i+1].isdigit(): + */ + __pyx_r = 0; + goto __pyx_L0; + } + + /* "spacy/en.pyx":237 + * return False + * # Don't count commas as punct if the next char is a number + * if word[i] == "," and i < (length - 1) and word[i+1].isdigit(): # <<<<<<<<<<<<<< + * return False + * # Don't count periods as punct if the next char is a number + */ + __pyx_t_1 = __Pyx_GetItemInt_Unicode(__pyx_v_word, __pyx_v_i, size_t, 0, __Pyx_PyInt_FromSize_t, 0, 0, 1); if (unlikely(__pyx_t_1 == (Py_UCS4)-1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 237; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + __pyx_t_3 = (__pyx_t_1 == 44); + if (__pyx_t_3) { + __pyx_t_2 = (__pyx_v_i < (__pyx_v_length - 1)); + if (__pyx_t_2) { + __pyx_t_4 = (__pyx_v_i + 1); + __pyx_t_1 = __Pyx_GetItemInt_Unicode(__pyx_v_word, __pyx_t_4, size_t, 0, __Pyx_PyInt_FromSize_t, 0, 0, 1); if (unlikely(__pyx_t_1 == (Py_UCS4)-1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 237; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + __pyx_t_6 = Py_UNICODE_ISDIGIT(__pyx_t_1); + __pyx_t_5 = (__pyx_t_6 != 0); + } else { + __pyx_t_5 = __pyx_t_2; + } + __pyx_t_2 = __pyx_t_5; + } else { + __pyx_t_2 = __pyx_t_3; + } + if (__pyx_t_2) { + + /* "spacy/en.pyx":238 + * # Don't count commas as punct if the next char is a number + * if word[i] == "," and i < (length - 1) and word[i+1].isdigit(): + * return False # <<<<<<<<<<<<<< + * # Don't count periods as punct if the next char is a number + * if word[i] == "." and i < (length - 1) and word[i+1].isdigit(): + */ + __pyx_r = 0; + goto __pyx_L0; + } + + /* "spacy/en.pyx":240 + * return False + * # Don't count periods as punct if the next char is a number + * if word[i] == "." and i < (length - 1) and word[i+1].isdigit(): # <<<<<<<<<<<<<< + * return False + * return not word[i].isalnum() + */ + __pyx_t_1 = __Pyx_GetItemInt_Unicode(__pyx_v_word, __pyx_v_i, size_t, 0, __Pyx_PyInt_FromSize_t, 0, 0, 1); if (unlikely(__pyx_t_1 == (Py_UCS4)-1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 240; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + __pyx_t_2 = (__pyx_t_1 == 46); + if (__pyx_t_2) { + __pyx_t_3 = (__pyx_v_i < (__pyx_v_length - 1)); + if (__pyx_t_3) { + __pyx_t_4 = (__pyx_v_i + 1); + __pyx_t_1 = __Pyx_GetItemInt_Unicode(__pyx_v_word, __pyx_t_4, size_t, 0, __Pyx_PyInt_FromSize_t, 0, 0, 1); if (unlikely(__pyx_t_1 == (Py_UCS4)-1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 240; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + __pyx_t_5 = Py_UNICODE_ISDIGIT(__pyx_t_1); + __pyx_t_6 = (__pyx_t_5 != 0); + } else { + __pyx_t_6 = __pyx_t_3; + } + __pyx_t_3 = __pyx_t_6; + } else { + __pyx_t_3 = __pyx_t_2; + } + if (__pyx_t_3) { + + /* "spacy/en.pyx":241 + * # Don't count periods as punct if the next char is a number + * if word[i] == "." and i < (length - 1) and word[i+1].isdigit(): + * return False # <<<<<<<<<<<<<< + * return not word[i].isalnum() + */ + __pyx_r = 0; + goto __pyx_L0; + } + + /* "spacy/en.pyx":242 + * if word[i] == "." and i < (length - 1) and word[i+1].isdigit(): + * return False * return not word[i].isalnum() # <<<<<<<<<<<<<< */ - __pyx_t_1 = __Pyx_GetItemInt_Unicode(__pyx_v_word, __pyx_v_i, size_t, 0, __Pyx_PyInt_FromSize_t, 0, 0, 1); if (unlikely(__pyx_t_1 == (Py_UCS4)-1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 191; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; - __pyx_t_2 = Py_UNICODE_ISALNUM(__pyx_t_1); - __pyx_r = (!(__pyx_t_2 != 0)); + __pyx_t_1 = __Pyx_GetItemInt_Unicode(__pyx_v_word, __pyx_v_i, size_t, 0, __Pyx_PyInt_FromSize_t, 0, 0, 1); if (unlikely(__pyx_t_1 == (Py_UCS4)-1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 242; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + __pyx_t_3 = Py_UNICODE_ISALNUM(__pyx_t_1); + __pyx_r = (!(__pyx_t_3 != 0)); goto __pyx_L0; - /* "spacy/en.pyx":190 + /* "spacy/en.pyx":232 * * * cdef bint is_punct(unicode word, size_t i, size_t length): # <<<<<<<<<<<<<< - * return not word[i].isalnum() + * # Don't count appostrophes as punct if the next char is a letter + * if word[i] == "'" and i < (length - 1) and word[i+1].isalpha(): */ /* function exit code */ @@ -3106,11 +3715,75 @@ static int __pyx_f_5spacy_2en_is_punct(PyObject *__pyx_v_word, size_t __pyx_v_i, return __pyx_r; } +/* "vector.to_py":63 + * + * @cname("__pyx_convert_vector_to_py___pyx_t_5spacy_2en_Lexeme_addr") + * cdef object __pyx_convert_vector_to_py___pyx_t_5spacy_2en_Lexeme_addr(vector[X]& v): # <<<<<<<<<<<<<< + * return [X_to_py(v[i]) for i in range(v.size())] + * + */ + +static PyObject *__pyx_convert_vector_to_py___pyx_t_5spacy_2en_Lexeme_addr(const std::vector<__pyx_t_5spacy_2en_Lexeme_addr> &__pyx_v_v) { + size_t __pyx_v_i; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + size_t __pyx_t_2; + size_t __pyx_t_3; + PyObject *__pyx_t_4 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("__pyx_convert_vector_to_py___pyx_t_5spacy_2en_Lexeme_addr", 0); + + /* "vector.to_py":64 + * @cname("__pyx_convert_vector_to_py___pyx_t_5spacy_2en_Lexeme_addr") + * cdef object __pyx_convert_vector_to_py___pyx_t_5spacy_2en_Lexeme_addr(vector[X]& v): + * return [X_to_py(v[i]) for i in range(v.size())] # <<<<<<<<<<<<<< + * + * + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 64; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = __pyx_v_v.size(); + for (__pyx_t_3 = 0; __pyx_t_3 < __pyx_t_2; __pyx_t_3+=1) { + __pyx_v_i = __pyx_t_3; + __pyx_t_4 = __Pyx_PyInt_FromSize_t((__pyx_v_v[__pyx_v_i])); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 64; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_4); + if (unlikely(__Pyx_ListComp_Append(__pyx_t_1, (PyObject*)__pyx_t_4))) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 64; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + } + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "vector.to_py":63 + * + * @cname("__pyx_convert_vector_to_py___pyx_t_5spacy_2en_Lexeme_addr") + * cdef object __pyx_convert_vector_to_py___pyx_t_5spacy_2en_Lexeme_addr(vector[X]& v): # <<<<<<<<<<<<<< + * return [X_to_py(v[i]) for i in range(v.size())] + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_4); + __Pyx_AddTraceback("vector.to_py.__pyx_convert_vector_to_py___pyx_t_5spacy_2en_Lexeme_addr", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + static PyMethodDef __pyx_methods[] = { - {__Pyx_NAMESTR("lookup"), (PyCFunction)__pyx_pw_5spacy_2en_3lookup, METH_O, __Pyx_DOCSTR(__pyx_doc_5spacy_2en_2lookup)}, - {__Pyx_NAMESTR("lookup_chunk"), (PyCFunction)__pyx_pw_5spacy_2en_5lookup_chunk, METH_VARARGS|METH_KEYWORDS, __Pyx_DOCSTR(__pyx_doc_5spacy_2en_4lookup_chunk)}, - {__Pyx_NAMESTR("unhash"), (PyCFunction)__pyx_pw_5spacy_2en_7unhash, METH_O, __Pyx_DOCSTR(__pyx_doc_5spacy_2en_6unhash)}, - {__Pyx_NAMESTR("_substr"), (PyCFunction)__pyx_pw_5spacy_2en_9_substr, METH_VARARGS|METH_KEYWORDS, __Pyx_DOCSTR(0)}, + {__Pyx_NAMESTR("tokenize"), (PyCFunction)__pyx_pw_5spacy_2en_3tokenize, METH_O, __Pyx_DOCSTR(0)}, + {__Pyx_NAMESTR("lookup"), (PyCFunction)__pyx_pw_5spacy_2en_5lookup, METH_O, __Pyx_DOCSTR(__pyx_doc_5spacy_2en_4lookup)}, + {__Pyx_NAMESTR("lookup_chunk"), (PyCFunction)__pyx_pw_5spacy_2en_7lookup_chunk, METH_VARARGS|METH_KEYWORDS, __Pyx_DOCSTR(__pyx_doc_5spacy_2en_6lookup_chunk)}, + {__Pyx_NAMESTR("unhash"), (PyCFunction)__pyx_pw_5spacy_2en_9unhash, METH_O, __Pyx_DOCSTR(__pyx_doc_5spacy_2en_8unhash)}, + {__Pyx_NAMESTR("_substr"), (PyCFunction)__pyx_pw_5spacy_2en_11_substr, METH_VARARGS|METH_KEYWORDS, __Pyx_DOCSTR(0)}, {0, 0, 0, 0} }; @@ -3162,6 +3835,7 @@ static __Pyx_StringTabEntry __pyx_string_tab[] = { {&__pyx_n_s_oft_upper, __pyx_k_oft_upper, sizeof(__pyx_k_oft_upper), 0, 0, 1, 1}, {&__pyx_n_s_prob, __pyx_k_prob, sizeof(__pyx_k_prob), 0, 0, 1, 1}, {&__pyx_n_s_pyx_capi, __pyx_k_pyx_capi, sizeof(__pyx_k_pyx_capi), 0, 0, 1, 1}, + {&__pyx_n_s_range, __pyx_k_range, sizeof(__pyx_k_range), 0, 0, 1, 1}, {&__pyx_n_s_read_tokenization, __pyx_k_read_tokenization, sizeof(__pyx_k_read_tokenization), 0, 0, 1, 1}, {&__pyx_kp_u_s, __pyx_k_s, sizeof(__pyx_k_s), 0, 1, 0, 0}, {&__pyx_kp_u_s_d_s, __pyx_k_s_d_s, sizeof(__pyx_k_s_d_s), 0, 1, 0, 0}, @@ -3179,8 +3853,9 @@ static __Pyx_StringTabEntry __pyx_string_tab[] = { {0, 0, 0, 0, 0, 0, 0} }; static int __Pyx_InitCachedBuiltins(void) { - __pyx_builtin_enumerate = __Pyx_GetBuiltinName(__pyx_n_s_enumerate); if (!__pyx_builtin_enumerate) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_builtin_ValueError = __Pyx_GetBuiltinName(__pyx_n_s_ValueError); if (!__pyx_builtin_ValueError) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 88; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_builtin_enumerate = __Pyx_GetBuiltinName(__pyx_n_s_enumerate); if (!__pyx_builtin_enumerate) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_builtin_range = __Pyx_GetBuiltinName(__pyx_n_s_range); if (!__pyx_builtin_range) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 53; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_builtin_ValueError = __Pyx_GetBuiltinName(__pyx_n_s_ValueError); if (!__pyx_builtin_ValueError) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 130; __pyx_clineno = __LINE__; goto __pyx_L1_error;} return 0; __pyx_L1_error:; return -1; @@ -3190,26 +3865,26 @@ static int __Pyx_InitCachedConstants(void) { __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("__Pyx_InitCachedConstants", 0); - /* "spacy/en.pyx":24 + /* "spacy/en.pyx":25 * * * def load_tokenization(token_rules): # <<<<<<<<<<<<<< * cdef Lexeme* word * cdef StringHash hashed */ - __pyx_tuple__2 = PyTuple_Pack(9, __pyx_n_s_token_rules, __pyx_n_s_word, __pyx_n_s_hashed, __pyx_n_s_chunk, __pyx_n_s_lex, __pyx_n_s_tokens, __pyx_n_s_i, __pyx_n_s_token_string, __pyx_n_s_length); if (unlikely(!__pyx_tuple__2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 24; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_tuple__2 = PyTuple_Pack(9, __pyx_n_s_token_rules, __pyx_n_s_word, __pyx_n_s_hashed, __pyx_n_s_chunk, __pyx_n_s_lex, __pyx_n_s_tokens, __pyx_n_s_i, __pyx_n_s_token_string, __pyx_n_s_length); if (unlikely(!__pyx_tuple__2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 25; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_tuple__2); __Pyx_GIVEREF(__pyx_tuple__2); - __pyx_codeobj__3 = (PyObject*)__Pyx_PyCode_New(1, 0, 9, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__2, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_Users_matt_repos_spaCy_spacy_en, __pyx_n_s_load_tokenization, 24, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__3)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 24; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_codeobj__3 = (PyObject*)__Pyx_PyCode_New(1, 0, 9, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__2, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_Users_matt_repos_spaCy_spacy_en, __pyx_n_s_load_tokenization, 25, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__3)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 25; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - /* "spacy/en.pyx":39 + /* "spacy/en.pyx":40 * * * load_tokenization(util.read_tokenization('en')) # <<<<<<<<<<<<<< * - * cpdef Lexeme_addr lookup(unicode string) except 0: + * */ - __pyx_tuple__4 = PyTuple_Pack(1, __pyx_n_u_en); if (unlikely(!__pyx_tuple__4)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 39; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_tuple__4 = PyTuple_Pack(1, __pyx_n_u_en); if (unlikely(!__pyx_tuple__4)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 40; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_tuple__4); __Pyx_GIVEREF(__pyx_tuple__4); __Pyx_RefNannyFinishContext(); @@ -3321,40 +3996,40 @@ PyMODINIT_FUNC PyInit_en(void) /*--- Function import code ---*/ /*--- Execution code ---*/ - /* "spacy/en.pyx":13 + /* "spacy/en.pyx":14 * from ext.murmurhash cimport MurmurHash64A * from ext.murmurhash cimport MurmurHash64B * from . import util # <<<<<<<<<<<<<< * * */ - __pyx_t_1 = PyList_New(1); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = PyList_New(1); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 14; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __Pyx_INCREF(__pyx_n_s_util); PyList_SET_ITEM(__pyx_t_1, 0, __pyx_n_s_util); __Pyx_GIVEREF(__pyx_n_s_util); - __pyx_t_2 = __Pyx_Import(__pyx_n_s_, __pyx_t_1, 1); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_Import(__pyx_n_s_, __pyx_t_1, 1); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 14; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_1 = __Pyx_ImportFrom(__pyx_t_2, __pyx_n_s_util); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_ImportFrom(__pyx_t_2, __pyx_n_s_util); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 14; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); - if (PyDict_SetItem(__pyx_d, __pyx_n_s_util, __pyx_t_1) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyDict_SetItem(__pyx_d, __pyx_n_s_util, __pyx_t_1) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 14; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - /* "spacy/en.pyx":16 + /* "spacy/en.pyx":17 * * * STRINGS = {} # <<<<<<<<<<<<<< * LEXEMES = dense_hash_map[StringHash, Lexeme_ptr]() * LEXEMES.set_empty_key(0) */ - __pyx_t_2 = PyDict_New(); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = PyDict_New(); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - if (PyDict_SetItem(__pyx_d, __pyx_n_s_STRINGS, __pyx_t_2) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyDict_SetItem(__pyx_d, __pyx_n_s_STRINGS, __pyx_t_2) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - /* "spacy/en.pyx":17 + /* "spacy/en.pyx":18 * * STRINGS = {} * LEXEMES = dense_hash_map[StringHash, Lexeme_ptr]() # <<<<<<<<<<<<<< @@ -3363,7 +4038,7 @@ PyMODINIT_FUNC PyInit_en(void) */ __pyx_v_5spacy_2en_LEXEMES = google::dense_hash_map<__pyx_t_5spacy_6lexeme_StringHash,__pyx_t_5spacy_2en_Lexeme_ptr>(); - /* "spacy/en.pyx":18 + /* "spacy/en.pyx":19 * STRINGS = {} * LEXEMES = dense_hash_map[StringHash, Lexeme_ptr]() * LEXEMES.set_empty_key(0) # <<<<<<<<<<<<<< @@ -3372,7 +4047,7 @@ PyMODINIT_FUNC PyInit_en(void) */ __pyx_v_5spacy_2en_LEXEMES.set_empty_key(0); - /* "spacy/en.pyx":21 + /* "spacy/en.pyx":22 * * * cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL) # <<<<<<<<<<<<<< @@ -3391,41 +4066,41 @@ PyMODINIT_FUNC PyInit_en(void) __pyx_t_3.tail = NULL; __pyx_v_5spacy_2en_BLANK_WORD = __pyx_t_3; - /* "spacy/en.pyx":24 + /* "spacy/en.pyx":25 * * * def load_tokenization(token_rules): # <<<<<<<<<<<<<< * cdef Lexeme* word * cdef StringHash hashed */ - __pyx_t_2 = PyCFunction_NewEx(&__pyx_mdef_5spacy_2en_1load_tokenization, NULL, __pyx_n_s_spacy_en); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 24; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = PyCFunction_NewEx(&__pyx_mdef_5spacy_2en_1load_tokenization, NULL, __pyx_n_s_spacy_en); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 25; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - if (PyDict_SetItem(__pyx_d, __pyx_n_s_load_tokenization, __pyx_t_2) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 24; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyDict_SetItem(__pyx_d, __pyx_n_s_load_tokenization, __pyx_t_2) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 25; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - /* "spacy/en.pyx":39 + /* "spacy/en.pyx":40 * * * load_tokenization(util.read_tokenization('en')) # <<<<<<<<<<<<<< * - * cpdef Lexeme_addr lookup(unicode string) except 0: + * */ - __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_load_tokenization); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 39; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_load_tokenization); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 40; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_util); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 39; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_util); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 40; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); - __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_read_tokenization); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 39; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_read_tokenization); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 40; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_4); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_4, __pyx_tuple__4, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 39; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_4, __pyx_tuple__4, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 40; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; - __pyx_t_4 = PyTuple_New(1); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 39; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_4 = PyTuple_New(1); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 40; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_4); PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_1); __Pyx_GIVEREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_4, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 39; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_4, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 40; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; @@ -3440,6 +4115,14 @@ PyMODINIT_FUNC PyInit_en(void) __Pyx_GOTREF(__pyx_t_1); if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_1) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "vector.to_py":63 + * + * @cname("__pyx_convert_vector_to_py___pyx_t_5spacy_2en_Lexeme_addr") + * cdef object __pyx_convert_vector_to_py___pyx_t_5spacy_2en_Lexeme_addr(vector[X]& v): # <<<<<<<<<<<<<< + * return [X_to_py(v[i]) for i in range(v.size())] + * + */ goto __pyx_L0; __pyx_L1_error:; __Pyx_XDECREF(__pyx_t_1); @@ -3546,6 +4229,32 @@ static int __Pyx_IternextUnpackEndCheck(PyObject *retval, Py_ssize_t expected) { return 0; } +static void __Pyx_RaiseArgumentTypeInvalid(const char* name, PyObject *obj, PyTypeObject *type) { + PyErr_Format(PyExc_TypeError, + "Argument '%.200s' has incorrect type (expected %.200s, got %.200s)", + name, type->tp_name, Py_TYPE(obj)->tp_name); +} +static CYTHON_INLINE int __Pyx_ArgTypeTest(PyObject *obj, PyTypeObject *type, int none_allowed, + const char *name, int exact) +{ + if (unlikely(!type)) { + PyErr_SetString(PyExc_SystemError, "Missing type object"); + return 0; + } + if (none_allowed && obj == Py_None) return 1; + else if (exact) { + if (likely(Py_TYPE(obj) == type)) return 1; + #if PY_MAJOR_VERSION == 2 + else if ((type == &PyBaseString_Type) && likely(__Pyx_PyBaseString_CheckExact(obj))) return 1; + #endif + } + else { + if (likely(PyObject_TypeCheck(obj, type))) return 1; + } + __Pyx_RaiseArgumentTypeInvalid(name, obj, type); + return 0; +} + static CYTHON_INLINE int __Pyx_PyBytes_Equals(PyObject* s1, PyObject* s2, int equals) { #if CYTHON_COMPILING_IN_PYPY return PyObject_RichCompareBool(s1, s2, equals); @@ -3668,32 +4377,6 @@ return_ne: #endif } -static void __Pyx_RaiseArgumentTypeInvalid(const char* name, PyObject *obj, PyTypeObject *type) { - PyErr_Format(PyExc_TypeError, - "Argument '%.200s' has incorrect type (expected %.200s, got %.200s)", - name, type->tp_name, Py_TYPE(obj)->tp_name); -} -static CYTHON_INLINE int __Pyx_ArgTypeTest(PyObject *obj, PyTypeObject *type, int none_allowed, - const char *name, int exact) -{ - if (unlikely(!type)) { - PyErr_SetString(PyExc_SystemError, "Missing type object"); - return 0; - } - if (none_allowed && obj == Py_None) return 1; - else if (exact) { - if (likely(Py_TYPE(obj) == type)) return 1; - #if PY_MAJOR_VERSION == 2 - else if ((type == &PyBaseString_Type) && likely(__Pyx_PyBaseString_CheckExact(obj))) return 1; - #endif - } - else { - if (likely(PyObject_TypeCheck(obj, type))) return 1; - } - __Pyx_RaiseArgumentTypeInvalid(name, obj, type); - return 0; -} - static void __Pyx_RaiseArgtupleInvalid( const char* func_name, int exact, diff --git a/spacy/en.pyx b/spacy/en.pyx index 1fc2f7102..ce1607ff7 100644 --- a/spacy/en.pyx +++ b/spacy/en.pyx @@ -6,6 +6,7 @@ from __future__ import unicode_literals from libc.stdlib cimport malloc, calloc, free from libc.stdint cimport uint64_t +from libcpp.vector cimport vector from spacy.lexeme cimport Lexeme from ext.murmurhash cimport MurmurHash64A @@ -38,6 +39,47 @@ def load_tokenization(token_rules): load_tokenization(util.read_tokenization('en')) + +cpdef vector[Lexeme_addr] tokenize(unicode string) except *: + cdef size_t length = len(string) + cdef Py_UNICODE* characters = string + + cdef size_t i + cdef Py_UNICODE c + + cdef vector[Lexeme_addr] tokens = vector[Lexeme_addr]() + cdef unicode current = u'' + cdef Lexeme* token + for i in range(length): + c = characters[i] + if is_whitespace(c): + if current: + token = lookup(current) + while token != NULL: + tokens.push_back(token) + token = token.tail + current = u'' + else: + current += c + if current: + token = lookup(current) + while token != NULL: + tokens.push_back(token) + token = token.tail + return tokens + +cdef inline bint is_whitespace(Py_UNICODE c): + # TODO: Support other unicode spaces + # https://www.cs.tut.fi/~jkorpela/chars/spaces.html + if c == u' ': + return True + elif c == u'\n': + return True + elif c == u'\t': + return True + else: + return False + cpdef Lexeme_addr lookup(unicode string) except 0: '''.. function:: enumerate(sequence[, start=0]) Fetch a Lexeme representing a word string. If the word has not been seen, @@ -179,13 +221,22 @@ cdef size_t _find_split(unicode word, size_t length): # Leading punctuation if is_punct(word, 0, length): return 1 - elif length >= 1 and is_punct(word, length - 1, length): + elif length >= 1: # Split off all trailing punctuation characters - i = length - 1 - while i >= 2 and is_punct(word, i-1, length): - i -= 1 + i = 0 + while i < length and not is_punct(word, i, length): + i += 1 return i cdef bint is_punct(unicode word, size_t i, size_t length): + # Don't count appostrophes as punct if the next char is a letter + if word[i] == "'" and i < (length - 1) and word[i+1].isalpha(): + return False + # Don't count commas as punct if the next char is a number + if word[i] == "," and i < (length - 1) and word[i+1].isdigit(): + return False + # Don't count periods as punct if the next char is a number + if word[i] == "." and i < (length - 1) and word[i+1].isdigit(): + return False return not word[i].isalnum() diff --git a/spacy/lexeme.cpp b/spacy/lexeme.cpp index 1d8806510..615cfce18 100644 --- a/spacy/lexeme.cpp +++ b/spacy/lexeme.cpp @@ -1,4 +1,4 @@ -/* Generated by Cython 0.20.1 on Mon Jul 7 00:02:26 2014 */ +/* Generated by Cython 0.20.1 on Mon Jul 7 01:14:44 2014 */ #define PY_SSIZE_T_CLEAN #ifndef CYTHON_USE_PYLONG_INTERNALS diff --git a/spacy/spacy.cpp b/spacy/spacy.cpp index 1f91607e5..f617c4ab9 100644 --- a/spacy/spacy.cpp +++ b/spacy/spacy.cpp @@ -1,4 +1,4 @@ -/* Generated by Cython 0.20.1 on Mon Jul 7 00:02:26 2014 */ +/* Generated by Cython 0.20.1 on Mon Jul 7 01:14:44 2014 */ #define PY_SSIZE_T_CLEAN #ifndef CYTHON_USE_PYLONG_INTERNALS @@ -935,113 +935,6 @@ static PyObject *__pyx_pf_5spacy_5spacy_expand_chunk(CYTHON_UNUSED PyObject *__p return __pyx_r; } -/* "spacy/spacy.pyx":62 - * - * - * cdef inline bint is_whitespace(Py_UNICODE c): # <<<<<<<<<<<<<< - * # TODO: Support other unicode spaces - * # https://www.cs.tut.fi/~jkorpela/chars/spaces.html - */ - -static CYTHON_INLINE int __pyx_f_5spacy_5spacy_is_whitespace(Py_UNICODE __pyx_v_c) { - int __pyx_r; - __Pyx_RefNannyDeclarations - __Pyx_RefNannySetupContext("is_whitespace", 0); - - /* "spacy/spacy.pyx":69 - * elif c == u'\n': - * return True - * elif c == u'\t': # <<<<<<<<<<<<<< - * return True - * else: - */ - switch (__pyx_v_c) { - - /* "spacy/spacy.pyx":65 - * # TODO: Support other unicode spaces - * # https://www.cs.tut.fi/~jkorpela/chars/spaces.html - * if c == u' ': # <<<<<<<<<<<<<< - * return True - * elif c == u'\n': - */ - case 32: - - /* "spacy/spacy.pyx":66 - * # https://www.cs.tut.fi/~jkorpela/chars/spaces.html - * if c == u' ': - * return True # <<<<<<<<<<<<<< - * elif c == u'\n': - * return True - */ - __pyx_r = 1; - goto __pyx_L0; - break; - - /* "spacy/spacy.pyx":67 - * if c == u' ': - * return True - * elif c == u'\n': # <<<<<<<<<<<<<< - * return True - * elif c == u'\t': - */ - case 10: - - /* "spacy/spacy.pyx":68 - * return True - * elif c == u'\n': - * return True # <<<<<<<<<<<<<< - * elif c == u'\t': - * return True - */ - __pyx_r = 1; - goto __pyx_L0; - break; - - /* "spacy/spacy.pyx":69 - * elif c == u'\n': - * return True - * elif c == u'\t': # <<<<<<<<<<<<<< - * return True - * else: - */ - case 9: - - /* "spacy/spacy.pyx":70 - * return True - * elif c == u'\t': - * return True # <<<<<<<<<<<<<< - * else: - * return False - */ - __pyx_r = 1; - goto __pyx_L0; - break; - default: - - /* "spacy/spacy.pyx":72 - * return True - * else: - * return False # <<<<<<<<<<<<<< - */ - __pyx_r = 0; - goto __pyx_L0; - break; - } - - /* "spacy/spacy.pyx":62 - * - * - * cdef inline bint is_whitespace(Py_UNICODE c): # <<<<<<<<<<<<<< - * # TODO: Support other unicode spaces - * # https://www.cs.tut.fi/~jkorpela/chars/spaces.html - */ - - /* function exit code */ - __pyx_L0:; - __Pyx_RefNannyFinishContext(); - return __pyx_r; -} - /* "vector.to_py":63 * * @cname("__pyx_convert_vector_to_py_size_t") diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx index 43b376767..876cfd841 100644 --- a/spacy/spacy.pyx +++ b/spacy/spacy.pyx @@ -11,62 +11,3 @@ cpdef vector[size_t] expand_chunk(size_t addr) except *: return tokens -""" -cpdef vector[size_t] ids_from_text(unicode text) except *: - cdef size_t length = len(text) - cdef Py_UNICODE* characters = text - - cdef size_t i - cdef Py_UNICODE c - - cdef vector[size_t] tokens = vector[size_t]() - cdef unicode current = u'' - cdef Lexeme* token - cdef int alnum_end = -1 - cdef size_t alnum_start = 0 - cdef bint seen_alnum = False - for i in range(length): - c = characters[i] - if is_whitespace(c): - token = lookup(current) - tokens.push_back(token) - clitic = 0 - while token.clitics[clitic]: - tokens.push_back(token.clitics[clitic]) - clitic += 1 - current = u'' - alnum_start = 0 - alnum_end = -1 - seen_alnum = False - else: - if not seen_alnum and c.isalnum(): - alnum_start = i - seen_alnum = True - elif seen_alnum and alnum_end == -1 and not c.isalnum(): - alnum_end = i - current += c - if current: - token = lookup(current) - tokens.push_back(token) - clitic = 0 - while token.clitics[clitic]: - tokens.push_back(token.clitics[clitic]) - clitic += 1 - return tokens -""" - -#cdef vector[Tokens] group_by(Tokens tokens, LexAttr field) except *: -# pass - - -cdef inline bint is_whitespace(Py_UNICODE c): - # TODO: Support other unicode spaces - # https://www.cs.tut.fi/~jkorpela/chars/spaces.html - if c == u' ': - return True - elif c == u'\n': - return True - elif c == u'\t': - return True - else: - return False diff --git a/tests/.test_tokenizer.py.swo b/tests/.test_tokenizer.py.swo index 7b7be95fa2b7341b375187f22e711ec8fe865b21..c549d7a9fd5297f1af12b19fc16a6abbd7914ea6 100644 GIT binary patch delta 546 zcmZwCJxE(o6bJD020^R&5P2qPL;IALdlOA!HAM|%C`du^W2n|a#6+aF6Xk@l*!O01!Waw> z5SE*0=(P4)|NOK0JVKO9DpmRZbbsW~rA4ak?pxQVgBGH#2aPBa^>ywd`U5ri2HWrv z)?gJD;U##W6Lk3JBszl{Y(f^6!Gc)`fg3IzM0>CfuN*W>;QuW?pgy4WqAXi>hA zUjF;+F_)!-uY}&zowrptw~IlZ>Urq0^QxF!QkI>Uff#=CTRieb?|m;U0{m9`{r7i! zK0Fky2l};Xc8V_T1seN!R5-o3NGUUID6^y%QdY)NrADtr*u!6XGg?1ii&%a4^Hj)- xYEFLDqSYHYqsdYsx%457+02}2Sh<2-RCkjSrHgjXXar2I1oh^-I?uIW>_5(jh{^x} delta 344 zcma*hJxBrp7{>7@aH!1AXfSB#f}#RZAtSMmsU=94qBSk7!R*2=N*9kBo5Hz4Xb6HJ z4uYUL=worprKKfXa}N98;_L%Ic!%eO$H_Pub33-Vnlk4jvm!yMGWW3lbaLCA3UB-yUQu)F=1bq$zGdZ) j54Bsg3Z`x4a|b)6axT)js?X76^>;et{!L!_x+(ty%j-|! diff --git a/tests/sun.tokens b/tests/sun.tokens new file mode 100644 index 000000000..d16fa1eae --- /dev/null +++ b/tests/sun.tokens @@ -0,0 +1,4 @@ +The Sun is the star at the center of the Solar System. It is almost perfectly spherical and consists of hot plasma interwoven with magnetic fields. [ 12 ] [ 13 ] It has a diameter of about 1 , 392 , 684 km ( 865 , 374 mi ) , [ 5 ] around 109 times that of Earth , and its mass ( 1.989×1030 kilograms , approximately 330 , 000 times the mass of Earth ) accounts for about 99.86 % of the total mass of the Solar System. [ 14 ] Chemically , about three quarters of the Sun 's mass consists of hydrogen , while the rest is mostly helium. The remaining 1.69 % ( equal to 5 , 600 times the mass of Earth ) consists of heavier elements , including oxygen , carbon , neon and iron , among others. [ 15 ] + +The Sun formed about 4.567 billion [ a ] [ 16 ] years ago from the gravitational collapse of a region within a large molecular cloud. Most of the matter gathered in the center , while the rest flattened into an orbiting disk that would become the Solar System. The central mass became increasingly hot and dense , eventually initiating thermonuclear fusion in its core. It is thought that almost all stars form by this process. The Sun is a G-type main-sequence star ( G2V ) based on spectral class and it is informally designated as a yellow dwarf because its visible radiation is most intense in the yellow-green portion of the spectrum , and although it is actually white in color , from the surface of the Earth it may appear yellow because of atmospheric scattering of blue light. [ 17 ] In the spectral class label , G2 indicates its surface temperature , of approximately 5778 K ( 5505 °C ) , and V indicates that the Sun , like most stars , is a main-sequence star , and thus generates its energy by nuclear fusion of hydrogen nuclei into helium. In its core , the Sun fuses about 620 million metric tons of hydrogen each second. [ 18 ] [ 19 ] +Once regarded by astronomers as a small and relatively insignificant star , the Sun is now thought to be brighter than about 85 % of the stars in the Milky Way , most of which are red dwarfs. [ 20 ] [ 21 ] The absolute magnitude of the Sun is +4.83 ; however , as the star closest to Earth , the Sun is by far the brightest object in the sky with an apparent magnitude of −26.74. [ 22 ] [ 23 ] This is about 13 billion times brighter than the next brightest star , Sirius , with an apparent magnitude of −1.46. The Sun 's hot corona continuously expands in space creating the solar wind , a stream of charged particles that extends to the heliopause at roughly 100 astronomical units. The bubble in the interstellar medium formed by the solar wind , the heliosphere , is the largest continuous structure in the Solar System. [ 24 ] [ 25 ] diff --git a/tests/sun.txt b/tests/sun.txt new file mode 100644 index 000000000..1b04f8bc9 --- /dev/null +++ b/tests/sun.txt @@ -0,0 +1,4 @@ +The Sun is the star at the center of the Solar System. It is almost perfectly spherical and consists of hot plasma interwoven with magnetic fields.[12][13] It has a diameter of about 1,392,684 km (865,374 mi),[5] around 109 times that of Earth, and its mass (1.989×1030 kilograms, approximately 330,000 times the mass of Earth) accounts for about 99.86% of the total mass of the Solar System.[14] Chemically, about three quarters of the Sun's mass consists of hydrogen, while the rest is mostly helium. The remaining 1.69% (equal to 5,600 times the mass of Earth) consists of heavier elements, including oxygen, carbon, neon and iron, among others.[15] + +The Sun formed about 4.567 billion[a][16] years ago from the gravitational collapse of a region within a large molecular cloud. Most of the matter gathered in the center, while the rest flattened into an orbiting disk that would become the Solar System. The central mass became increasingly hot and dense, eventually initiating thermonuclear fusion in its core. It is thought that almost all stars form by this process. The Sun is a G-type main-sequence star (G2V) based on spectral class and it is informally designated as a yellow dwarf because its visible radiation is most intense in the yellow-green portion of the spectrum, and although it is actually white in color, from the surface of the Earth it may appear yellow because of atmospheric scattering of blue light.[17] In the spectral class label, G2 indicates its surface temperature, of approximately 5778 K (5505 °C), and V indicates that the Sun, like most stars, is a main-sequence star, and thus generates its energy by nuclear fusion of hydrogen nuclei into helium. In its core, the Sun fuses about 620 million metric tons of hydrogen each second.[18][19] +Once regarded by astronomers as a small and relatively insignificant star, the Sun is now thought to be brighter than about 85% of the stars in the Milky Way, most of which are red dwarfs.[20][21] The absolute magnitude of the Sun is +4.83; however, as the star closest to Earth, the Sun is by far the brightest object in the sky with an apparent magnitude of −26.74.[22][23] This is about 13 billion times brighter than the next brightest star, Sirius, with an apparent magnitude of −1.46. The Sun's hot corona continuously expands in space creating the solar wind, a stream of charged particles that extends to the heliopause at roughly 100 astronomical units. The bubble in the interstellar medium formed by the solar wind, the heliosphere, is the largest continuous structure in the Solar System.[24][25] diff --git a/tests/tokenizer.sed b/tests/tokenizer.sed new file mode 100644 index 000000000..f5f891c49 --- /dev/null +++ b/tests/tokenizer.sed @@ -0,0 +1,82 @@ +#!/bin/sed -f + +# Sed script to produce Penn Treebank tokenization on arbitrary raw text. +# Yeah, sure. + +# expected input: raw text with ONE SENTENCE TOKEN PER LINE + +# by Robert MacIntyre, University of Pennsylvania, late 1995. + +# If this wasn't such a trivial program, I'd include all that stuff about +# no warrantee, free use, etc. from the GNU General Public License. If you +# want to be picky, assume that all of its terms apply. Okay? + +# attempt to get correct directional quotes +s=^"=`` =g +s=\([ ([{<]\)"=\1 `` =g +# close quotes handled at end + +s=\.\.\.= ... =g +s=[,;:@#$%&]= & =g + +# Assume sentence tokenization has been done first, so split FINAL periods +# only. +s=\([^.]\)\([.]\)\([])}>"']*\)[ ]*$=\1 \2\3 =g +# however, we may as well split ALL question marks and exclamation points, +# since they shouldn't have the abbrev.-marker ambiguity problem +s=[?!]= & =g + +# parentheses, brackets, etc. +s=[][(){}<>]= & =g +# Some taggers, such as Adwait Ratnaparkhi's MXPOST, use the parsed-file +# version of these symbols. +# UNCOMMENT THE FOLLOWING 6 LINES if you're using MXPOST. +# s/(/-LRB-/g +# s/)/-RRB-/g +# s/\[/-LSB-/g +# s/\]/-RSB-/g +# s/{/-LCB-/g +# s/}/-RCB-/g + +s=--= -- =g + +# NOTE THAT SPLIT WORDS ARE NOT MARKED. Obviously this isn't great, since +# you might someday want to know how the words originally fit together -- +# but it's too late to make a better system now, given the millions of +# words we've already done "wrong". + +# First off, add a space to the beginning and end of each line, to reduce +# necessary number of regexps. +s=$= = +s=^= = + +s="= '' =g +# possessive or close-single-quote +s=\([^']\)' =\1 ' =g +# as in it's, I'm, we'd +s='\([sSmMdD]\) = '\1 =g +s='ll = 'll =g +s='re = 're =g +s='ve = 've =g +s=n't = n't =g +s='LL = 'LL =g +s='RE = 'RE =g +s='VE = 'VE =g +s=N'T = N'T =g + +s= \([Cc]\)annot = \1an not =g +s= \([Dd]\)'ye = \1' ye =g +s= \([Gg]\)imme = \1im me =g +s= \([Gg]\)onna = \1on na =g +s= \([Gg]\)otta = \1ot ta =g +s= \([Ll]\)emme = \1em me =g +s= \([Mm]\)ore'n = \1ore 'n =g +s= '\([Tt]\)is = '\1 is =g +s= '\([Tt]\)was = '\1 was =g +s= \([Ww]\)anna = \1an na =g +# s= \([Ww]\)haddya = \1ha dd ya =g +# s= \([Ww]\)hatcha = \1ha t cha =g + +# clean out extra spaces +s= *= =g +s=^ *==g