* Upd from spacy

This commit is contained in:
Matthew Honnibal 2014-07-23 17:35:18 +01:00
parent 87bf205b82
commit a895fe5ddb
5 changed files with 51 additions and 17 deletions

View File

@@ -1,8 +1,17 @@
from .lexeme import lex_of
from .lexeme import sic_of
from .tokens import Tokens

# Don't know how to get the enum Python visible :(
SIC = 0
LEX = 1
NORM = 2
SHAPE = 3
LAST3 = 4

# Public package API. NOTE(review): this deliberately lists the objects
# themselves rather than their names (the file's existing convention) —
# the stale duplicate `__all__ = [lex_of, sic_of]` assignment is removed.
__all__ = [Tokens, lex_of, sic_of, SIC, LEX, NORM, SHAPE, LAST3]
"""

View File

@@ -38,11 +38,13 @@ cdef bint is_punct(unicode word, size_t i, size_t length):
if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
# ...Unless we're at 0
return i == 0
if word[i] == "-" and i < (length - 1) and word[i+1] == '-':
return False
# Don't count commas as punct if the next char is a number
if word[i] == "," and i < (length - 1) and word[i+1].isdigit():
return False
-    # Don't count periods as punct if the next char is a number
-    if word[i] == "." and i < (length - 1) and word[i+1].isdigit():
+    # Don't count periods as punct if the next char is not whitespace
+    if word[i] == "." and i < (length - 1) and not word[i+1].isspace():
return False
return not word[i].isalnum()

View File

@@ -16,12 +16,12 @@ from . import util
from os import path
cimport cython
def get_normalized(unicode lex, size_t length):
return lex.lower()
#if lex.isdigit():
# return '!YEAR' if length == 4 else '!DIGIT'
#else:
# return lex.lower()
if lex.isalpha() and lex.islower():
return lex
else:
return get_word_shape(lex, length)
def get_word_shape(lex, length):
@@ -55,7 +55,6 @@ def set_orth_flags(lex, length):
return 0
cdef class Language:
def __cinit__(self, name):
self.name = name

View File

@@ -14,5 +14,5 @@ cdef class Tokens:
cpdef int append(self, Lexeme_addr token)
cpdef int extend(self, Tokens other) except -1
-    cpdef list group_by(self, StringAttr attr)
+    cpdef object group_by(self, StringAttr attr)
cpdef dict count_by(self, StringAttr attr)

View File

@@ -37,21 +37,45 @@ cdef class Tokens:
for el in other:
self.append(el)
cpdef list group_by(self, StringAttr attr):
cpdef object group_by(self, StringAttr attr):
'''Group tokens that share the property attr into Tokens instances, and
return a list of them. Returns a tuple of three lists:
(string names, hashes, tokens)
The lists are aligned, so the ith entry in string names is the string
that the ith entry in hashes unhashes to, which the Tokens instance
is grouped by.
You can then use count_by or group_by on the Tokens
for further processing. Calling group_by and then asking the length
of the Tokens objects is equivalent to count_by, but somewhat slower.
'''
# Implementation here is working around some of the constraints in
# Cython about what type of thing can go in what type of container.
# Long story short, it's pretty hard to get a Python object like
# Tokens into a vector or array. If we really need this to run faster,
# we can be tricky and get the Python list access out of the loop. What
# we'd do is store pointers to the underlying vectors.
# So far, speed isn't mattering here.
cdef dict indices = {}
cdef vector[vector[Lexeme_addr]] groups = vector[vector[Lexeme_addr]]()
cdef list groups = []
cdef list names = []
cdef list hashes = []
cdef StringHash key
cdef Lexeme_addr t
for t in self.vctr[0]:
key = attr_of(t, attr)
if key in indices:
groups[indices[key]].push_back(t)
groups[indices[key]].append(t)
else:
indices[key] = groups.size()
groups.push_back(vector[Lexeme_addr]())
groups.back().push_back(t)
return groups
indices[key] = len(groups)
groups.append(Tokens(self.lang))
names.append(self.lang.unhash(key))
hashes.append(key)
groups[-1].append(t)
return names, hashes, groups
cpdef dict count_by(self, StringAttr attr):
counts = {}