Mirror of https://github.com/explosion/spaCy.git (synced 2024-11-10 19:57:17 +03:00)
* Upd from spacy

commit a895fe5ddb
parent 87bf205b82
@@ -1,8 +1,17 @@
from .lexeme import lex_of
from .lexeme import sic_of

from .tokens import Tokens

__all__ = [lex_of, sic_of]
# Don't know how to get the enum Python visible :(

SIC = 0
LEX = 1
NORM = 2
SHAPE = 3
LAST3 = 4

__all__ = [Tokens, lex_of, sic_of, SIC, LEX, NORM, SHAPE, LAST3]


"""
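The enum itself stays at the Cython level, so these module-level ints are what Python callers pass as the attr argument to Tokens.group_by and Tokens.count_by (declared later in this commit). A minimal usage sketch, assuming tokens is a Tokens instance built elsewhere:

    # Sketch only: `tokens` is an assumed Tokens instance; the constant selects
    # which string attribute the grouping keys on.
    names, hashes, groups = tokens.group_by(SHAPE)   # one group per distinct shape
    counts = tokens.count_by(SIC)                    # presumably hash -> frequency per SIC value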
@@ -38,11 +38,13 @@ cdef bint is_punct(unicode word, size_t i, size_t length):
    if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
        # ...Unless we're at 0
        return i == 0
    if word[i] == "-" and i < (length - 1) and word[i+1] == '-':
        return False
    # Don't count commas as punct if the next char is a number
    if word[i] == "," and i < (length - 1) and word[i+1].isdigit():
        return False
    # Don't count periods as punct if the next char is a number
    if word[i] == "." and i < (length - 1) and word[i+1].isdigit():
    # Don't count periods as punct if the next char is not whitespace
    if word[i] == "." and i < (length - 1) and not word[i+1].isspace():
        return False
    return not word[i].isalnum()
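For readers skimming the interleaved old and new lines above, here is a plain-Python restatement of the post-change checks, with the motivating cases as comments. This is a sketch only; the committed code is the Cython function in the hunk.

    def is_punct(word, i, length):
        # A leading apostrophe is punctuation ("'the"); an internal one ("don't") is not.
        if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
            return i == 0
        # The first dash of "--" stays attached to the second.
        if word[i] == "-" and i < (length - 1) and word[i+1] == '-':
            return False
        # A comma followed by a digit ("10,000") is part of the number.
        if word[i] == "," and i < (length - 1) and word[i+1].isdigit():
            return False
        # A period followed by any non-space character ("U.S.") is not split off.
        if word[i] == "." and i < (length - 1) and not word[i+1].isspace():
            return False
        return not word[i].isalnum()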
@@ -16,12 +16,12 @@ from . import util
from os import path
cimport cython


def get_normalized(unicode lex, size_t length):
    return lex.lower()
    #if lex.isdigit():
    #    return '!YEAR' if length == 4 else '!DIGIT'
    #else:
    #    return lex.lower()
    if lex.isalpha() and lex.islower():
        return lex
    else:
        return get_word_shape(lex, length)


def get_word_shape(lex, length):
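After this change get_normalized returns early, so the digit bucketing stays commented out and the shape fallback below the return is unreachable: normalization is plain lowercasing. For illustration:

    # Post-change behaviour, shown as plain Python (sketch; mirrors the early return above):
    assert u"The".lower() == u"the"       # get_normalized(u"The", 3)
    assert u"1984".lower() == u"1984"     # no '!YEAR'/'!DIGIT' bucketing, no shape fallback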
@@ -55,7 +55,6 @@ def set_orth_flags(lex, length):
    return 0



cdef class Language:
    def __cinit__(self, name):
        self.name = name
@@ -14,5 +14,5 @@ cdef class Tokens:
    cpdef int append(self, Lexeme_addr token)
    cpdef int extend(self, Tokens other) except -1

    cpdef list group_by(self, StringAttr attr)
    cpdef object group_by(self, StringAttr attr)
    cpdef dict count_by(self, StringAttr attr)
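The declared return type is loosened from list to object here because, as the next hunk shows, group_by now returns a (names, hashes, groups) tuple rather than a bare list of groups.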
@@ -37,21 +37,45 @@ cdef class Tokens:
        for el in other:
            self.append(el)

    cpdef list group_by(self, StringAttr attr):
    cpdef object group_by(self, StringAttr attr):
        '''Group tokens that share the property attr into Tokens instances, and
        return a list of them. Returns a tuple of three lists:

            (string names, hashes, tokens)

        The lists are aligned, so the ith entry in string names is the string
        that the ith entry in hashes unhashes to, which the Tokens instance
        is grouped by.

        You can then use count_by or group_by on the Tokens
        for further processing. Calling group_by and then asking the length
        of the Tokens objects is equivalent to count_by, but somewhat slower.
        '''
        # Implementation here is working around some of the constraints in
        # Cython about what type of thing can go in what type of container.
        # Long story short, it's pretty hard to get a Python object like
        # Tokens into a vector or array. If we really need this to run faster,
        # we can be tricky and get the Python list access out of the loop. What
        # we'd do is store pointers to the underlying vectors.
        # So far, speed isn't mattering here.
        cdef dict indices = {}
        cdef vector[vector[Lexeme_addr]] groups = vector[vector[Lexeme_addr]]()
        cdef list groups = []
        cdef list names = []
        cdef list hashes = []

        cdef StringHash key
        cdef Lexeme_addr t
        for t in self.vctr[0]:
            key = attr_of(t, attr)
            if key in indices:
                groups[indices[key]].push_back(t)
                groups[indices[key]].append(t)
            else:
                indices[key] = groups.size()
                groups.push_back(vector[Lexeme_addr]())
                groups.back().push_back(t)
        return groups
                indices[key] = len(groups)
                groups.append(Tokens(self.lang))
                names.append(self.lang.unhash(key))
                hashes.append(key)
                groups[-1].append(t)
        return names, hashes, groups

    cpdef dict count_by(self, StringAttr attr):
        counts = {}
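To make the new return convention concrete, a small consumption sketch (illustrative only; building tokens, and the values stored by count_by, are outside this hunk):

    names, hashes, groups = tokens.group_by(NORM)
    by_name = dict(zip(names, groups))     # e.g. by_name[u'the'] is a Tokens instance
    by_hash = dict(zip(hashes, groups))    # the same groups, keyed by StringHash

    # Per the docstring, counting via group_by is equivalent to count_by, only slower:
    counts = tokens.count_by(NORM)         # presumably StringHash -> frequency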