* Working tokenization. en doesn't match PTB perfectly. Need to reorganize before adding more schemes.

2025-10-26 13:41:21 +03:00 · 2014-07-07 01:15:59 +02:00 · 2014-07-07 01:15:59 +02:00 · a62c38e1ef
commit a62c38e1ef
parent 4e79446dc2
9 changed files with 1199 additions and 541 deletions
--- a/spacy/en.cpp
+++ b/spacy/en.cpp
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@ -6,6 +6,7 @@ from __future__ import unicode_literals

 from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
+from libcpp.vector cimport vector

 from spacy.lexeme cimport Lexeme
 from ext.murmurhash cimport MurmurHash64A
@ -38,6 +39,47 @@ def load_tokenization(token_rules):

 load_tokenization(util.read_tokenization('en'))

+
+cpdef vector[Lexeme_addr] tokenize(unicode string) except *:
+    cdef size_t length = len(string)
+    cdef Py_UNICODE* characters = <Py_UNICODE*>string
+
+    cdef size_t i
+    cdef Py_UNICODE c
+
+    cdef vector[Lexeme_addr] tokens = vector[Lexeme_addr]()
+    cdef unicode current = u''
+    cdef Lexeme* token
+    for i in range(length):
+        c = characters[i]
+        if is_whitespace(c):
+            if current:
+                token = <Lexeme*>lookup(current)
+                while token != NULL:
+                    tokens.push_back(<Lexeme_addr>token)
+                    token = token.tail
+            current = u''
+        else:
+            current += c
+    if current:
+        token = <Lexeme*>lookup(current)
+        while token != NULL:
+            tokens.push_back(<Lexeme_addr>token)
+            token = token.tail
+    return tokens
+
+cdef inline bint is_whitespace(Py_UNICODE c):
+    # TODO: Support other unicode spaces
+    # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
+    if c == u' ':
+        return True
+    elif c == u'\n':
+        return True
+    elif c == u'\t':
+        return True
+    else:
+        return False
+
 cpdef Lexeme_addr lookup(unicode string) except 0:
    '''.. function:: enumerate(sequence[, start=0])
    Fetch a Lexeme representing a word string. If the word has not been seen,
@ -179,13 +221,22 @@ cdef size_t _find_split(unicode word, size_t length):
    # Leading punctuation
    if is_punct(word, 0, length):
        return 1
-    elif length >= 1 and is_punct(word, length - 1, length):
+    elif length >= 1:
        # Split off all trailing punctuation characters
-        i = length - 1
-        while i >= 2 and is_punct(word, i-1, length):
-            i -= 1
+        i = 0
+        while i < length and not is_punct(word, i, length):
+            i += 1
    return i


 cdef bint is_punct(unicode word, size_t i, size_t length):
+    # Don't count apostrophes as punct if the next char is a letter
+    if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
+        return False
+    # Don't count commas as punct if the next char is a number
+    if word[i] == "," and i < (length - 1) and word[i+1].isdigit():
+        return False
+    # Don't count periods as punct if the next char is a number
+    if word[i] == "." and i < (length - 1) and word[i+1].isdigit():
+        return False
    return not word[i].isalnum()
--- a/spacy/lexeme.cpp
+++ b/spacy/lexeme.cpp
@ -1,4 +1,4 @@
-/* Generated by Cython 0.20.1 on Mon Jul  7 00:02:26 2014 */
+/* Generated by Cython 0.20.1 on Mon Jul  7 01:14:44 2014 */

 #define PY_SSIZE_T_CLEAN
 #ifndef CYTHON_USE_PYLONG_INTERNALS
--- a/spacy/spacy.cpp
+++ b/spacy/spacy.cpp
@ -1,4 +1,4 @@
-/* Generated by Cython 0.20.1 on Mon Jul  7 00:02:26 2014 */
+/* Generated by Cython 0.20.1 on Mon Jul  7 01:14:44 2014 */

 #define PY_SSIZE_T_CLEAN
 #ifndef CYTHON_USE_PYLONG_INTERNALS
@ -935,113 +935,6 @@ static PyObject *__pyx_pf_5spacy_5spacy_expand_chunk(CYTHON_UNUSED PyObject *__p
  return __pyx_r;
 }

-/* "spacy/spacy.pyx":62
- * 
- * 
- * cdef inline bint is_whitespace(Py_UNICODE c):             # <<<<<<<<<<<<<<
- *     # TODO: Support other unicode spaces
- *     # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
- */
-
-static CYTHON_INLINE int __pyx_f_5spacy_5spacy_is_whitespace(Py_UNICODE __pyx_v_c) {
-  int __pyx_r;
-  __Pyx_RefNannyDeclarations
-  __Pyx_RefNannySetupContext("is_whitespace", 0);
-
-  /* "spacy/spacy.pyx":69
- *     elif c == u'\n':
- *         return True
- *     elif c == u'\t':             # <<<<<<<<<<<<<<
- *         return True
- *     else:
- */
-  switch (__pyx_v_c) {
-
-    /* "spacy/spacy.pyx":65
- *     # TODO: Support other unicode spaces
- *     # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
- *     if c == u' ':             # <<<<<<<<<<<<<<
- *         return True
- *     elif c == u'\n':
- */
-    case 32:
-
-    /* "spacy/spacy.pyx":66
- *     # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
- *     if c == u' ':
- *         return True             # <<<<<<<<<<<<<<
- *     elif c == u'\n':
- *         return True
- */
-    __pyx_r = 1;
-    goto __pyx_L0;
-    break;
-
-    /* "spacy/spacy.pyx":67
- *     if c == u' ':
- *         return True
- *     elif c == u'\n':             # <<<<<<<<<<<<<<
- *         return True
- *     elif c == u'\t':
- */
-    case 10:
-
-    /* "spacy/spacy.pyx":68
- *         return True
- *     elif c == u'\n':
- *         return True             # <<<<<<<<<<<<<<
- *     elif c == u'\t':
- *         return True
- */
-    __pyx_r = 1;
-    goto __pyx_L0;
-    break;
-
-    /* "spacy/spacy.pyx":69
- *     elif c == u'\n':
- *         return True
- *     elif c == u'\t':             # <<<<<<<<<<<<<<
- *         return True
- *     else:
- */
-    case 9:
-
-    /* "spacy/spacy.pyx":70
- *         return True
- *     elif c == u'\t':
- *         return True             # <<<<<<<<<<<<<<
- *     else:
- *         return False
- */
-    __pyx_r = 1;
-    goto __pyx_L0;
-    break;
-    default:
-
-    /* "spacy/spacy.pyx":72
- *         return True
- *     else:
- *         return False             # <<<<<<<<<<<<<<
- */
-    __pyx_r = 0;
-    goto __pyx_L0;
-    break;
-  }
-
-  /* "spacy/spacy.pyx":62
- * 
- * 
- * cdef inline bint is_whitespace(Py_UNICODE c):             # <<<<<<<<<<<<<<
- *     # TODO: Support other unicode spaces
- *     # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
- */
-
-  /* function exit code */
-  __pyx_L0:;
-  __Pyx_RefNannyFinishContext();
-  return __pyx_r;
-}
-
 /* "vector.to_py":63
 * 
 * @cname("__pyx_convert_vector_to_py_size_t")
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@ -11,62 +11,3 @@ cpdef vector[size_t] expand_chunk(size_t addr) except *:
    return tokens


-"""
-cpdef vector[size_t] ids_from_text(unicode text) except *:
-    cdef size_t length = len(text)
-    cdef Py_UNICODE* characters = <Py_UNICODE*>text
-
-    cdef size_t i
-    cdef Py_UNICODE c
-
-    cdef vector[size_t] tokens = vector[size_t]()
-    cdef unicode current = u''
-    cdef Lexeme* token
-    cdef int alnum_end = -1
-    cdef size_t alnum_start = 0
-    cdef bint seen_alnum = False
-    for i in range(length):
-        c = characters[i]
-        if is_whitespace(c):
-            token = <Lexeme*>lookup(current)
-            tokens.push_back(<size_t>token)
-            clitic = 0
-            while token.clitics[clitic]:
-                tokens.push_back(token.clitics[clitic])
-                clitic += 1
-            current = u''
-            alnum_start = 0
-            alnum_end = -1
-            seen_alnum = False
-        else:
-            if not seen_alnum and c.isalnum():
-                alnum_start = i
-                seen_alnum = True
-            elif seen_alnum and alnum_end == -1 and not c.isalnum():
-                alnum_end = i
-            current += c
-    if current:
-        token = <Lexeme*>lookup(current)
-        tokens.push_back(<size_t>token)
-        clitic = 0
-        while token.clitics[clitic]:
-            tokens.push_back(token.clitics[clitic])
-            clitic += 1
-    return tokens
-"""
-
-#cdef vector[Tokens] group_by(Tokens tokens, LexAttr field) except *:
-#    pass
-
-
-cdef inline bint is_whitespace(Py_UNICODE c):
-    # TODO: Support other unicode spaces
-    # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
-    if c == u' ':
-        return True
-    elif c == u'\n':
-        return True
-    elif c == u'\t':
-        return True
-    else:
-        return False
--- a/tests/.test_tokenizer.py.swo
+++ b/tests/.test_tokenizer.py.swo
--- a/tests/sun.tokens
+++ b/tests/sun.tokens
@ -0,0 +1,4 @@
+The Sun is the star at the center of the Solar System. It is almost perfectly spherical and consists of hot plasma interwoven with magnetic fields. [ 12 ] [ 13 ] It has a diameter of about 1 , 392 , 684 km ( 865 , 374 mi ) , [ 5 ] around 109 times that of Earth , and its mass ( 1.989×1030 kilograms , approximately 330 , 000 times the mass of Earth ) accounts for about 99.86 % of the total mass of the Solar System. [ 14 ] Chemically , about three quarters of the Sun 's mass consists of hydrogen , while the rest is mostly helium. The remaining 1.69 % ( equal to 5 , 600 times the mass of Earth ) consists of heavier elements , including oxygen , carbon , neon and iron , among others. [ 15 ] 
+
+The Sun formed about 4.567 billion [ a ] [ 16 ] years ago from the gravitational collapse of a region within a large molecular cloud. Most of the matter gathered in the center , while the rest flattened into an orbiting disk that would become the Solar System. The central mass became increasingly hot and dense , eventually initiating thermonuclear fusion in its core. It is thought that almost all stars form by this process. The Sun is a G-type main-sequence star ( G2V ) based on spectral class and it is informally designated as a yellow dwarf because its visible radiation is most intense in the yellow-green portion of the spectrum , and although it is actually white in color , from the surface of the Earth it may appear yellow because of atmospheric scattering of blue light. [ 17 ] In the spectral class label , G2 indicates its surface temperature , of approximately 5778 K ( 5505 °C ) , and V indicates that the Sun , like most stars , is a main-sequence star , and thus generates its energy by nuclear fusion of hydrogen nuclei into helium. In its core , the Sun fuses about 620 million metric tons of hydrogen each second. [ 18 ] [ 19 ] 
+Once regarded by astronomers as a small and relatively insignificant star , the Sun is now thought to be brighter than about 85 % of the stars in the Milky Way , most of which are red dwarfs. [ 20 ] [ 21 ] The absolute magnitude of the Sun is +4.83 ; however , as the star closest to Earth , the Sun is by far the brightest object in the sky with an apparent magnitude of −26.74. [ 22 ] [ 23 ] This is about 13 billion times brighter than the next brightest star , Sirius , with an apparent magnitude of −1.46. The Sun 's hot corona continuously expands in space creating the solar wind , a stream of charged particles that extends to the heliopause at roughly 100 astronomical units. The bubble in the interstellar medium formed by the solar wind , the heliosphere , is the largest continuous structure in the Solar System. [ 24 ] [ 25 ] 
--- a/tests/sun.txt
+++ b/tests/sun.txt
@ -0,0 +1,4 @@
+The Sun is the star at the center of the Solar System. It is almost perfectly spherical and consists of hot plasma interwoven with magnetic fields.[12][13] It has a diameter of about 1,392,684 km (865,374 mi),[5] around 109 times that of Earth, and its mass (1.989×1030 kilograms, approximately 330,000 times the mass of Earth) accounts for about 99.86% of the total mass of the Solar System.[14] Chemically, about three quarters of the Sun's mass consists of hydrogen, while the rest is mostly helium. The remaining 1.69% (equal to 5,600 times the mass of Earth) consists of heavier elements, including oxygen, carbon, neon and iron, among others.[15]
+
+The Sun formed about 4.567 billion[a][16] years ago from the gravitational collapse of a region within a large molecular cloud. Most of the matter gathered in the center, while the rest flattened into an orbiting disk that would become the Solar System. The central mass became increasingly hot and dense, eventually initiating thermonuclear fusion in its core. It is thought that almost all stars form by this process. The Sun is a G-type main-sequence star (G2V) based on spectral class and it is informally designated as a yellow dwarf because its visible radiation is most intense in the yellow-green portion of the spectrum, and although it is actually white in color, from the surface of the Earth it may appear yellow because of atmospheric scattering of blue light.[17] In the spectral class label, G2 indicates its surface temperature, of approximately 5778 K (5505 °C), and V indicates that the Sun, like most stars, is a main-sequence star, and thus generates its energy by nuclear fusion of hydrogen nuclei into helium. In its core, the Sun fuses about 620 million metric tons of hydrogen each second.[18][19]
+Once regarded by astronomers as a small and relatively insignificant star, the Sun is now thought to be brighter than about 85% of the stars in the Milky Way, most of which are red dwarfs.[20][21] The absolute magnitude of the Sun is +4.83; however, as the star closest to Earth, the Sun is by far the brightest object in the sky with an apparent magnitude of −26.74.[22][23] This is about 13 billion times brighter than the next brightest star, Sirius, with an apparent magnitude of −1.46. The Sun's hot corona continuously expands in space creating the solar wind, a stream of charged particles that extends to the heliopause at roughly 100 astronomical units. The bubble in the interstellar medium formed by the solar wind, the heliosphere, is the largest continuous structure in the Solar System.[24][25]
--- a/tests/tokenizer.sed
+++ b/tests/tokenizer.sed
@ -0,0 +1,82 @@
+#!/bin/sed -f
+
+# Sed script to produce Penn Treebank tokenization on arbitrary raw text.
+# Yeah, sure.
+
+# expected input: raw text with ONE SENTENCE TOKEN PER LINE
+
+# by Robert MacIntyre, University of Pennsylvania, late 1995.
+
+# If this wasn't such a trivial program, I'd include all that stuff about
+# no warranty, free use, etc. from the GNU General Public License.  If you
+# want to be picky, assume that all of its terms apply.  Okay?
+
+# attempt to get correct directional quotes
+s=^"=`` =g
+s=\([ ([{<]\)"=\1 `` =g
+# close quotes handled at end
+
+s=\.\.\.= ... =g
+s=[,;:@#$%&]= & =g
+
+# Assume sentence tokenization has been done first, so split FINAL periods
+# only. 
+s=\([^.]\)\([.]\)\([])}>"']*\)[ 	]*$=\1 \2\3 =g
+# however, we may as well split ALL question marks and exclamation points,
+# since they shouldn't have the abbrev.-marker ambiguity problem
+s=[?!]= & =g
+
+# parentheses, brackets, etc.
+s=[][(){}<>]= & =g
+# Some taggers, such as Adwait Ratnaparkhi's MXPOST, use the parsed-file
+# version of these symbols.
+# UNCOMMENT THE FOLLOWING 6 LINES if you're using MXPOST.
+# s/(/-LRB-/g
+# s/)/-RRB-/g
+# s/\[/-LSB-/g
+# s/\]/-RSB-/g
+# s/{/-LCB-/g
+# s/}/-RCB-/g
+
+s=--= -- =g
+
+# NOTE THAT SPLIT WORDS ARE NOT MARKED.  Obviously this isn't great, since
+# you might someday want to know how the words originally fit together --
+# but it's too late to make a better system now, given the millions of
+# words we've already done "wrong".
+
+# First off, add a space to the beginning and end of each line, to reduce
+# necessary number of regexps.
+s=$= =
+s=^= =
+
+s="= '' =g
+# possessive or close-single-quote
+s=\([^']\)' =\1 ' =g
+# as in it's, I'm, we'd
+s='\([sSmMdD]\) = '\1 =g
+s='ll = 'll =g
+s='re = 're =g
+s='ve = 've =g
+s=n't = n't =g
+s='LL = 'LL =g
+s='RE = 'RE =g
+s='VE = 'VE =g
+s=N'T = N'T =g
+
+s= \([Cc]\)annot = \1an not =g
+s= \([Dd]\)'ye = \1' ye =g
+s= \([Gg]\)imme = \1im me =g
+s= \([Gg]\)onna = \1on na =g
+s= \([Gg]\)otta = \1ot ta =g
+s= \([Ll]\)emme = \1em me =g
+s= \([Mm]\)ore'n = \1ore 'n =g
+s= '\([Tt]\)is = '\1 is =g
+s= '\([Tt]\)was = '\1 was =g
+s= \([Ww]\)anna = \1an na =g
+# s= \([Ww]\)haddya = \1ha dd ya =g
+# s= \([Ww]\)hatcha = \1ha t cha =g
+
+# clean out extra spaces
+s=  *= =g
+s=^ *==g