* Upd from spacy

This commit is contained in:
Matthew Honnibal 2014-07-23 17:35:18 +01:00
parent 87bf205b82
commit a895fe5ddb
5 changed files with 51 additions and 17 deletions

View File

@@ -1,8 +1,17 @@
from .lexeme import lex_of from .lexeme import lex_of
from .lexeme import sic_of from .lexeme import sic_of
from .tokens import Tokens
__all__ = [lex_of, sic_of] # Don't know how to get the enum Python visible :(
SIC = 0
LEX = 1
NORM = 2
SHAPE = 3
LAST3 = 4
__all__ = [Tokens, lex_of, sic_of, SIC, LEX, NORM, SHAPE, LAST3]
""" """

View File

@@ -38,11 +38,13 @@ cdef bint is_punct(unicode word, size_t i, size_t length):
if word[i] == "'" and i < (length - 1) and word[i+1].isalpha(): if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
# ...Unless we're at 0 # ...Unless we're at 0
return i == 0 return i == 0
if word[i] == "-" and i < (length - 1) and word[i+1] == '-':
return False
# Don't count commas as punct if the next char is a number # Don't count commas as punct if the next char is a number
if word[i] == "," and i < (length - 1) and word[i+1].isdigit(): if word[i] == "," and i < (length - 1) and word[i+1].isdigit():
return False return False
# Don't count periods as punct if the next char is a number # Don't count periods as punct if the next char is not whitespace
if word[i] == "." and i < (length - 1) and word[i+1].isdigit(): if word[i] == "." and i < (length - 1) and not word[i+1].isspace():
return False return False
return not word[i].isalnum() return not word[i].isalnum()

View File

@@ -16,12 +16,12 @@ from . import util
from os import path from os import path
cimport cython cimport cython
def get_normalized(unicode lex, size_t length): def get_normalized(unicode lex, size_t length):
return lex.lower() if lex.isalpha() and lex.islower():
#if lex.isdigit(): return lex
# return '!YEAR' if length == 4 else '!DIGIT' else:
#else: return get_word_shape(lex, length)
# return lex.lower()
def get_word_shape(lex, length): def get_word_shape(lex, length):
@@ -55,7 +55,6 @@ def set_orth_flags(lex, length):
return 0 return 0
cdef class Language: cdef class Language:
def __cinit__(self, name): def __cinit__(self, name):
self.name = name self.name = name

View File

@@ -14,5 +14,5 @@ cdef class Tokens:
cpdef int append(self, Lexeme_addr token) cpdef int append(self, Lexeme_addr token)
cpdef int extend(self, Tokens other) except -1 cpdef int extend(self, Tokens other) except -1
cpdef list group_by(self, StringAttr attr) cpdef object group_by(self, StringAttr attr)
cpdef dict count_by(self, StringAttr attr) cpdef dict count_by(self, StringAttr attr)

View File

@@ -37,21 +37,45 @@ cdef class Tokens:
for el in other: for el in other:
self.append(el) self.append(el)
cpdef list group_by(self, StringAttr attr): cpdef object group_by(self, StringAttr attr):
'''Group tokens that share the property attr into Tokens instances, and
return a list of them. Returns a tuple of three lists:
(string names, hashes, tokens)
The lists are aligned, so the ith entry in string names is the string
that the ith entry in hashes unhashes to, which the Tokens instance
is grouped by.
You can then use count_by or group_by on the Tokens
for further processing. Calling group_by and then asking the length
of the Tokens objects is equivalent to count_by, but somewhat slower.
'''
# Implementation here is working around some of the constraints in
# Cython about what type of thing can go in what type of container.
# Long story short, it's pretty hard to get a Python object like
# Tokens into a vector or array. If we really need this to run faster,
# we can be tricky and get the Python list access out of the loop. What
# we'd do is store pointers to the underlying vectors.
# So far, speed isn't mattering here.
cdef dict indices = {} cdef dict indices = {}
cdef vector[vector[Lexeme_addr]] groups = vector[vector[Lexeme_addr]]() cdef list groups = []
cdef list names = []
cdef list hashes = []
cdef StringHash key cdef StringHash key
cdef Lexeme_addr t cdef Lexeme_addr t
for t in self.vctr[0]: for t in self.vctr[0]:
key = attr_of(t, attr) key = attr_of(t, attr)
if key in indices: if key in indices:
groups[indices[key]].push_back(t) groups[indices[key]].append(t)
else: else:
indices[key] = groups.size() indices[key] = len(groups)
groups.push_back(vector[Lexeme_addr]()) groups.append(Tokens(self.lang))
groups.back().push_back(t) names.append(self.lang.unhash(key))
return groups hashes.append(key)
groups[-1].append(t)
return names, hashes, groups
cpdef dict count_by(self, StringAttr attr): cpdef dict count_by(self, StringAttr attr):
counts = {} counts = {}