* Working tokenization. en doesn't match PTB perfectly. Need to reorganize before adding more schemes.

This commit is contained in:
Matthew Honnibal 2014-07-07 01:15:59 +02:00
parent 4e79446dc2
commit a62c38e1ef
9 changed files with 1199 additions and 541 deletions

File diff suppressed because it is too large Load Diff

View File

@ -6,6 +6,7 @@ from __future__ import unicode_literals
from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t
from libcpp.vector cimport vector
from spacy.lexeme cimport Lexeme
from ext.murmurhash cimport MurmurHash64A
@ -38,6 +39,47 @@ def load_tokenization(token_rules):
load_tokenization(util.read_tokenization('en'))
cpdef vector[Lexeme_addr] tokenize(unicode string) except *:
    """Split `string` on whitespace and return the Lexeme addresses of its tokens.

    Every maximal run of non-whitespace characters (as judged by
    is_whitespace) is passed to lookup(). The Lexeme it returns, followed by
    each Lexeme reachable through successive `tail` pointers, is appended to
    the result in order. Whitespace itself never produces a token, so an
    empty or all-whitespace string yields an empty vector.
    """
    cdef size_t length = len(string)
    cdef Py_UNICODE* characters = <Py_UNICODE*>string
    cdef size_t i
    # Index of the first character of the chunk currently being scanned.
    cdef size_t start = 0
    cdef vector[Lexeme_addr] tokens = vector[Lexeme_addr]()
    cdef Lexeme* token
    # Run one position past the end so the final chunk is flushed by the same
    # code path as a chunk that is terminated by whitespace. The `i == length`
    # test short-circuits, so characters[length] is never read.
    for i in range(length + 1):
        if i == length or is_whitespace(characters[i]):
            # Slice the chunk out once rather than growing a string one
            # character at a time; skip empty chunks (consecutive whitespace).
            if i > start:
                token = <Lexeme*>lookup(string[start:i])
                while token != NULL:
                    tokens.push_back(<Lexeme_addr>token)
                    token = token.tail
            start = i + 1
    return tokens
cdef inline bint is_whitespace(Py_UNICODE c):
    """Return True when `c` is a space, a newline or a tab, False otherwise."""
    # TODO: Support other unicode spaces
    # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
    return c == u' ' or c == u'\n' or c == u'\t'
cpdef Lexeme_addr lookup(unicode string) except 0:
'''.. function:: enumerate(sequence[, start=0])
Fetch a Lexeme representing a word string. If the word has not been seen,
@ -179,13 +221,22 @@ cdef size_t _find_split(unicode word, size_t length):
# Leading punctuation
if is_punct(word, 0, length):
return 1
elif length >= 1 and is_punct(word, length - 1, length):
elif length >= 1:
# Split off all trailing punctuation characters
i = length - 1
while i >= 2 and is_punct(word, i-1, length):
i -= 1
i = 0
while i < length and not is_punct(word, i, length):
i += 1
return i
cdef bint is_punct(unicode word, size_t i, size_t length):
    """Decide whether word[i] should be split off as punctuation.

    word:   the string being examined.
    i:      index of the character to classify.
    length: number of characters in `word`.

    Returns False for any alphanumeric character and for three look-ahead
    exceptions: an apostrophe followed by a letter, and a comma or a period
    followed by a digit. Every other character is reported as punctuation.
    An index at or beyond `length` is reported as not punctuation.
    """
    # Indexing past the end would raise inside this cdef function, which has
    # no `except` clause to propagate the error; answer explicitly instead.
    if i >= length:
        return False
    # `i + 1 < length` instead of `i < length - 1`: length is an unsigned
    # size_t, so the subtraction would wrap around when length is 0.
    if i + 1 < length:
        # Don't count apostrophes as punct if the next char is a letter
        if word[i] == "'" and word[i + 1].isalpha():
            return False
        # Don't count commas or periods as punct if the next char is a number
        if (word[i] == "," or word[i] == ".") and word[i + 1].isdigit():
            return False
    return not word[i].isalnum()

View File

@ -1,4 +1,4 @@
/* Generated by Cython 0.20.1 on Mon Jul 7 00:02:26 2014 */
/* Generated by Cython 0.20.1 on Mon Jul 7 01:14:44 2014 */
#define PY_SSIZE_T_CLEAN
#ifndef CYTHON_USE_PYLONG_INTERNALS

View File

@ -1,4 +1,4 @@
/* Generated by Cython 0.20.1 on Mon Jul 7 00:02:26 2014 */
/* Generated by Cython 0.20.1 on Mon Jul 7 01:14:44 2014 */
#define PY_SSIZE_T_CLEAN
#ifndef CYTHON_USE_PYLONG_INTERNALS
@ -935,113 +935,6 @@ static PyObject *__pyx_pf_5spacy_5spacy_expand_chunk(CYTHON_UNUSED PyObject *__p
return __pyx_r;
}
/* "spacy/spacy.pyx":62
*
*
* cdef inline bint is_whitespace(Py_UNICODE c): # <<<<<<<<<<<<<<
* # TODO: Support other unicode spaces
* # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
*/
/*
 * C implementation of the `is_whitespace` helper from spacy/spacy.pyx.
 * Sets the result to 1 when __pyx_v_c is code point 32 (space), 10 (line
 * feed) or 9 (tab), and to 0 for every other value.
 * NOTE(review): this looks like Cython-generated output; presumably changes
 * belong in the .pyx source rather than here -- confirm before hand-editing.
 */
static CYTHON_INLINE int __pyx_f_5spacy_5spacy_is_whitespace(Py_UNICODE __pyx_v_c) {
int __pyx_r;
__Pyx_RefNannyDeclarations
__Pyx_RefNannySetupContext("is_whitespace", 0);
/* "spacy/spacy.pyx":69
* elif c == u'\n':
* return True
* elif c == u'\t': # <<<<<<<<<<<<<<
* return True
* else:
*/
switch (__pyx_v_c) {
/* "spacy/spacy.pyx":65
* # TODO: Support other unicode spaces
* # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
* if c == u' ': # <<<<<<<<<<<<<<
* return True
* elif c == u'\n':
*/
case 32:
/* "spacy/spacy.pyx":66
* # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
* if c == u' ':
* return True # <<<<<<<<<<<<<<
* elif c == u'\n':
* return True
*/
__pyx_r = 1;
goto __pyx_L0;
break;
/* "spacy/spacy.pyx":67
* if c == u' ':
* return True
* elif c == u'\n': # <<<<<<<<<<<<<<
* return True
* elif c == u'\t':
*/
case 10:
/* "spacy/spacy.pyx":68
* return True
* elif c == u'\n':
* return True # <<<<<<<<<<<<<<
* elif c == u'\t':
* return True
*/
__pyx_r = 1;
goto __pyx_L0;
break;
/* "spacy/spacy.pyx":69
* elif c == u'\n':
* return True
* elif c == u'\t': # <<<<<<<<<<<<<<
* return True
* else:
*/
case 9:
/* "spacy/spacy.pyx":70
* return True
* elif c == u'\t':
* return True # <<<<<<<<<<<<<<
* else:
* return False
*/
__pyx_r = 1;
goto __pyx_L0;
break;
default:
/* "spacy/spacy.pyx":72
* return True
* else:
* return False # <<<<<<<<<<<<<<
*/
__pyx_r = 0;
goto __pyx_L0;
break;
}
/* "spacy/spacy.pyx":62
*
*
* cdef inline bint is_whitespace(Py_UNICODE c): # <<<<<<<<<<<<<<
* # TODO: Support other unicode spaces
* # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
*/
/* function exit code */
__pyx_L0:;
__Pyx_RefNannyFinishContext();
return __pyx_r;
}
/* "vector.to_py":63
*
* @cname("__pyx_convert_vector_to_py_size_t")

View File

@ -11,62 +11,3 @@ cpdef vector[size_t] expand_chunk(size_t addr) except *:
return tokens
"""
cpdef vector[size_t] ids_from_text(unicode text) except *:
cdef size_t length = len(text)
cdef Py_UNICODE* characters = <Py_UNICODE*>text
cdef size_t i
cdef Py_UNICODE c
cdef vector[size_t] tokens = vector[size_t]()
cdef unicode current = u''
cdef Lexeme* token
cdef int alnum_end = -1
cdef size_t alnum_start = 0
cdef bint seen_alnum = False
for i in range(length):
c = characters[i]
if is_whitespace(c):
token = <Lexeme*>lookup(current)
tokens.push_back(<size_t>token)
clitic = 0
while token.clitics[clitic]:
tokens.push_back(token.clitics[clitic])
clitic += 1
current = u''
alnum_start = 0
alnum_end = -1
seen_alnum = False
else:
if not seen_alnum and c.isalnum():
alnum_start = i
seen_alnum = True
elif seen_alnum and alnum_end == -1 and not c.isalnum():
alnum_end = i
current += c
if current:
token = <Lexeme*>lookup(current)
tokens.push_back(<size_t>token)
clitic = 0
while token.clitics[clitic]:
tokens.push_back(token.clitics[clitic])
clitic += 1
return tokens
"""
#cdef vector[Tokens] group_by(Tokens tokens, LexAttr field) except *:
# pass
cdef inline bint is_whitespace(Py_UNICODE c):
    """Return True when `c` is a space, a newline or a tab, False otherwise."""
    # TODO: Support other unicode spaces
    # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
    return c == u' ' or c == u'\n' or c == u'\t'

Binary file not shown.

4
tests/sun.tokens Normal file
View File

@ -0,0 +1,4 @@
The Sun is the star at the center of the Solar System. It is almost perfectly spherical and consists of hot plasma interwoven with magnetic fields. [ 12 ] [ 13 ] It has a diameter of about 1 , 392 , 684 km ( 865 , 374 mi ) , [ 5 ] around 109 times that of Earth , and its mass ( 1.989×1030 kilograms , approximately 330 , 000 times the mass of Earth ) accounts for about 99.86 % of the total mass of the Solar System. [ 14 ] Chemically , about three quarters of the Sun 's mass consists of hydrogen , while the rest is mostly helium. The remaining 1.69 % ( equal to 5 , 600 times the mass of Earth ) consists of heavier elements , including oxygen , carbon , neon and iron , among others. [ 15 ]
The Sun formed about 4.567 billion [ a ] [ 16 ] years ago from the gravitational collapse of a region within a large molecular cloud. Most of the matter gathered in the center , while the rest flattened into an orbiting disk that would become the Solar System. The central mass became increasingly hot and dense , eventually initiating thermonuclear fusion in its core. It is thought that almost all stars form by this process. The Sun is a G-type main-sequence star ( G2V ) based on spectral class and it is informally designated as a yellow dwarf because its visible radiation is most intense in the yellow-green portion of the spectrum , and although it is actually white in color , from the surface of the Earth it may appear yellow because of atmospheric scattering of blue light. [ 17 ] In the spectral class label , G2 indicates its surface temperature , of approximately 5778 K ( 5505 °C ) , and V indicates that the Sun , like most stars , is a main-sequence star , and thus generates its energy by nuclear fusion of hydrogen nuclei into helium. In its core , the Sun fuses about 620 million metric tons of hydrogen each second. [ 18 ] [ 19 ]
Once regarded by astronomers as a small and relatively insignificant star , the Sun is now thought to be brighter than about 85 % of the stars in the Milky Way , most of which are red dwarfs. [ 20 ] [ 21 ] The absolute magnitude of the Sun is +4.83 ; however , as the star closest to Earth , the Sun is by far the brightest object in the sky with an apparent magnitude of 26.74. [ 22 ] [ 23 ] This is about 13 billion times brighter than the next brightest star , Sirius , with an apparent magnitude of 1.46. The Sun 's hot corona continuously expands in space creating the solar wind , a stream of charged particles that extends to the heliopause at roughly 100 astronomical units. The bubble in the interstellar medium formed by the solar wind , the heliosphere , is the largest continuous structure in the Solar System. [ 24 ] [ 25 ]

4
tests/sun.txt Normal file
View File

@ -0,0 +1,4 @@
The Sun is the star at the center of the Solar System. It is almost perfectly spherical and consists of hot plasma interwoven with magnetic fields.[12][13] It has a diameter of about 1,392,684 km (865,374 mi),[5] around 109 times that of Earth, and its mass (1.989×1030 kilograms, approximately 330,000 times the mass of Earth) accounts for about 99.86% of the total mass of the Solar System.[14] Chemically, about three quarters of the Sun's mass consists of hydrogen, while the rest is mostly helium. The remaining 1.69% (equal to 5,600 times the mass of Earth) consists of heavier elements, including oxygen, carbon, neon and iron, among others.[15]
The Sun formed about 4.567 billion[a][16] years ago from the gravitational collapse of a region within a large molecular cloud. Most of the matter gathered in the center, while the rest flattened into an orbiting disk that would become the Solar System. The central mass became increasingly hot and dense, eventually initiating thermonuclear fusion in its core. It is thought that almost all stars form by this process. The Sun is a G-type main-sequence star (G2V) based on spectral class and it is informally designated as a yellow dwarf because its visible radiation is most intense in the yellow-green portion of the spectrum, and although it is actually white in color, from the surface of the Earth it may appear yellow because of atmospheric scattering of blue light.[17] In the spectral class label, G2 indicates its surface temperature, of approximately 5778 K (5505 °C), and V indicates that the Sun, like most stars, is a main-sequence star, and thus generates its energy by nuclear fusion of hydrogen nuclei into helium. In its core, the Sun fuses about 620 million metric tons of hydrogen each second.[18][19]
Once regarded by astronomers as a small and relatively insignificant star, the Sun is now thought to be brighter than about 85% of the stars in the Milky Way, most of which are red dwarfs.[20][21] The absolute magnitude of the Sun is +4.83; however, as the star closest to Earth, the Sun is by far the brightest object in the sky with an apparent magnitude of 26.74.[22][23] This is about 13 billion times brighter than the next brightest star, Sirius, with an apparent magnitude of 1.46. The Sun's hot corona continuously expands in space creating the solar wind, a stream of charged particles that extends to the heliopause at roughly 100 astronomical units. The bubble in the interstellar medium formed by the solar wind, the heliosphere, is the largest continuous structure in the Solar System.[24][25]

82
tests/tokenizer.sed Normal file
View File

@ -0,0 +1,82 @@
#!/bin/sed -f
# Sed script to produce Penn Treebank tokenization on arbitrary raw text.
# Yeah, sure.
# expected input: raw text with ONE SENTENCE TOKEN PER LINE
# by Robert MacIntyre, University of Pennsylvania, late 1995.
# If this wasn't such a trivial program, I'd include all that stuff about
# no warrantee, free use, etc. from the GNU General Public License. If you
# want to be picky, assume that all of its terms apply. Okay?
# attempt to get correct directional quotes
s=^"=`` =g
s=\([ ([{<]\)"=\1 `` =g
# close quotes handled at end
s=\.\.\.= ... =g
s=[,;:@#$%&]= & =g
# Assume sentence tokenization has been done first, so split FINAL periods
# only.
s=\([^.]\)\([.]\)\([])}>"']*\)[ ]*$=\1 \2\3 =g
# however, we may as well split ALL question marks and exclamation points,
# since they shouldn't have the abbrev.-marker ambiguity problem
s=[?!]= & =g
# parentheses, brackets, etc.
s=[][(){}<>]= & =g
# Some taggers, such as Adwait Ratnaparkhi's MXPOST, use the parsed-file
# version of these symbols.
# UNCOMMENT THE FOLLOWING 6 LINES if you're using MXPOST.
# s/(/-LRB-/g
# s/)/-RRB-/g
# s/\[/-LSB-/g
# s/\]/-RSB-/g
# s/{/-LCB-/g
# s/}/-RCB-/g
s=--= -- =g
# NOTE THAT SPLIT WORDS ARE NOT MARKED. Obviously this isn't great, since
# you might someday want to know how the words originally fit together --
# but it's too late to make a better system now, given the millions of
# words we've already done "wrong".
# First off, add a space to the beginning and end of each line, to reduce
# necessary number of regexps.
s=$= =
s=^= =
s="= '' =g
# possessive or close-single-quote
s=\([^']\)' =\1 ' =g
# as in it's, I'm, we'd
s='\([sSmMdD]\) = '\1 =g
s='ll = 'll =g
s='re = 're =g
s='ve = 've =g
s=n't = n't =g
s='LL = 'LL =g
s='RE = 'RE =g
s='VE = 'VE =g
s=N'T = N'T =g
s= \([Cc]\)annot = \1an not =g
s= \([Dd]\)'ye = \1' ye =g
s= \([Gg]\)imme = \1im me =g
s= \([Gg]\)onna = \1on na =g
s= \([Gg]\)otta = \1ot ta =g
s= \([Ll]\)emme = \1em me =g
s= \([Mm]\)ore'n = \1ore 'n =g
s= '\([Tt]\)is = '\1 is =g
s= '\([Tt]\)was = '\1 was =g
s= \([Ww]\)anna = \1an na =g
# s= \([Ww]\)haddya = \1ha dd ya =g
# s= \([Ww]\)hatcha = \1ha t cha =g
# clean out extra spaces
# The pattern below is TWO spaces followed by `*`, i.e. one or more spaces.
# With a single space before `*` the pattern also matches the empty string,
# and the `g` flag would then insert a space between every pair of characters.
s=  *= =g
s=^ *==g