mirror of
https://github.com/explosion/spaCy.git
synced 2025-04-28 12:53:41 +03:00
* Upd from spacy
This commit is contained in:
parent
87bf205b82
commit
a895fe5ddb
|
@ -1,8 +1,17 @@
|
||||||
from .lexeme import lex_of
|
from .lexeme import lex_of
|
||||||
from .lexeme import sic_of
|
from .lexeme import sic_of
|
||||||
|
|
||||||
|
from .tokens import Tokens
|
||||||
|
|
||||||
__all__ = [lex_of, sic_of]
|
# Don't know how to get the enum Python visible :(
|
||||||
|
|
||||||
|
SIC = 0
|
||||||
|
LEX = 1
|
||||||
|
NORM = 2
|
||||||
|
SHAPE = 3
|
||||||
|
LAST3 = 4
|
||||||
|
|
||||||
|
__all__ = [Tokens, lex_of, sic_of, SIC, LEX, NORM, SHAPE, LAST3]
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -38,11 +38,13 @@ cdef bint is_punct(unicode word, size_t i, size_t length):
|
||||||
if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
|
if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
|
||||||
# ...Unless we're at 0
|
# ...Unless we're at 0
|
||||||
return i == 0
|
return i == 0
|
||||||
|
if word[i] == "-" and i < (length - 1) and word[i+1] == '-':
|
||||||
|
return False
|
||||||
# Don't count commas as punct if the next char is a number
|
# Don't count commas as punct if the next char is a number
|
||||||
if word[i] == "," and i < (length - 1) and word[i+1].isdigit():
|
if word[i] == "," and i < (length - 1) and word[i+1].isdigit():
|
||||||
return False
|
return False
|
||||||
# Don't count periods as punct if the next char is a number
|
# Don't count periods as punct if the next char is not whitespace
|
||||||
if word[i] == "." and i < (length - 1) and word[i+1].isdigit():
|
if word[i] == "." and i < (length - 1) and not word[i+1].isspace():
|
||||||
return False
|
return False
|
||||||
return not word[i].isalnum()
|
return not word[i].isalnum()
|
||||||
|
|
||||||
|
|
|
@ -16,12 +16,12 @@ from . import util
|
||||||
from os import path
|
from os import path
|
||||||
cimport cython
|
cimport cython
|
||||||
|
|
||||||
|
|
||||||
def get_normalized(unicode lex, size_t length):
|
def get_normalized(unicode lex, size_t length):
|
||||||
return lex.lower()
|
if lex.isalpha() and lex.islower():
|
||||||
#if lex.isdigit():
|
return lex
|
||||||
# return '!YEAR' if length == 4 else '!DIGIT'
|
else:
|
||||||
#else:
|
return get_word_shape(lex, length)
|
||||||
# return lex.lower()
|
|
||||||
|
|
||||||
|
|
||||||
def get_word_shape(lex, length):
|
def get_word_shape(lex, length):
|
||||||
|
@ -55,7 +55,6 @@ def set_orth_flags(lex, length):
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Language:
|
cdef class Language:
|
||||||
def __cinit__(self, name):
|
def __cinit__(self, name):
|
||||||
self.name = name
|
self.name = name
|
||||||
|
|
|
@ -14,5 +14,5 @@ cdef class Tokens:
|
||||||
cpdef int append(self, Lexeme_addr token)
|
cpdef int append(self, Lexeme_addr token)
|
||||||
cpdef int extend(self, Tokens other) except -1
|
cpdef int extend(self, Tokens other) except -1
|
||||||
|
|
||||||
cpdef list group_by(self, StringAttr attr)
|
cpdef object group_by(self, StringAttr attr)
|
||||||
cpdef dict count_by(self, StringAttr attr)
|
cpdef dict count_by(self, StringAttr attr)
|
||||||
|
|
|
@ -37,21 +37,45 @@ cdef class Tokens:
|
||||||
for el in other:
|
for el in other:
|
||||||
self.append(el)
|
self.append(el)
|
||||||
|
|
||||||
cpdef list group_by(self, StringAttr attr):
|
cpdef object group_by(self, StringAttr attr):
|
||||||
|
'''Group tokens that share the property attr into Tokens instances, and
|
||||||
|
return a list of them. Returns a tuple of three lists:
|
||||||
|
|
||||||
|
(string names, hashes, tokens)
|
||||||
|
|
||||||
|
The lists are aligned, so the ith entry in string names is the string
|
||||||
|
that the ith entry in hashes unhashes to, which the Tokens instance
|
||||||
|
is grouped by.
|
||||||
|
|
||||||
|
You can then use count_by or group_by on the Tokens
|
||||||
|
for further processing. Calling group_by and then asking the length
|
||||||
|
of the Tokens objects is equivalent to count_by, but somewhat slower.
|
||||||
|
'''
|
||||||
|
# Implementation here is working around some of the constraints in
|
||||||
|
# Cython about what type of thing can go in what type of container.
|
||||||
|
# Long story short, it's pretty hard to get a Python object like
|
||||||
|
# Tokens into a vector or array. If we really need this to run faster,
|
||||||
|
# we can be tricky and get the Python list access out of the loop. What
|
||||||
|
# we'd do is store pointers to the underlying vectors.
|
||||||
|
# So far, speed isn't mattering here.
|
||||||
cdef dict indices = {}
|
cdef dict indices = {}
|
||||||
cdef vector[vector[Lexeme_addr]] groups = vector[vector[Lexeme_addr]]()
|
cdef list groups = []
|
||||||
|
cdef list names = []
|
||||||
|
cdef list hashes = []
|
||||||
|
|
||||||
cdef StringHash key
|
cdef StringHash key
|
||||||
cdef Lexeme_addr t
|
cdef Lexeme_addr t
|
||||||
for t in self.vctr[0]:
|
for t in self.vctr[0]:
|
||||||
key = attr_of(t, attr)
|
key = attr_of(t, attr)
|
||||||
if key in indices:
|
if key in indices:
|
||||||
groups[indices[key]].push_back(t)
|
groups[indices[key]].append(t)
|
||||||
else:
|
else:
|
||||||
indices[key] = groups.size()
|
indices[key] = len(groups)
|
||||||
groups.push_back(vector[Lexeme_addr]())
|
groups.append(Tokens(self.lang))
|
||||||
groups.back().push_back(t)
|
names.append(self.lang.unhash(key))
|
||||||
return groups
|
hashes.append(key)
|
||||||
|
groups[-1].append(t)
|
||||||
|
return names, hashes, groups
|
||||||
|
|
||||||
cpdef dict count_by(self, StringAttr attr):
|
cpdef dict count_by(self, StringAttr attr):
|
||||||
counts = {}
|
counts = {}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user