* Add German tokenizer files

This commit is contained in:
Matthew Honnibal 2014-09-25 18:29:13 +02:00
parent 2e44fa7179
commit 93505276ed
2 changed files with 168 additions and 0 deletions

42
spacy/de.pxd Normal file
View File

@ -0,0 +1,42 @@
from spacy.spacy cimport Language
from spacy.word cimport Lexeme
cimport cython
# Flag constants shared with Python code. Each name is declared exactly once
# here; the values are assigned from a running counter in de.pyx.

# Orthographic flags
cpdef size_t ALPHA
cpdef size_t DIGIT
cpdef size_t PUNCT
cpdef size_t SPACE
cpdef size_t LOWER
cpdef size_t UPPER
cpdef size_t TITLE
cpdef size_t ASCII

# Case-distribution flags
cpdef size_t OFT_LOWER
cpdef size_t OFT_TITLE
cpdef size_t OFT_UPPER

# Tag flags.
# NOTE(review): PUNCT is used both as an orthographic flag and as a tag flag.
# It can only be declared once, so it appears in the orthographic group above;
# de.pyx assigns it twice and the later (tag) value is the one that sticks.
cpdef size_t CONJ
cpdef size_t NUM
# X is the name de.pyx actually assigns in this position.
cpdef size_t X
# NOTE(review): N is never assigned in de.pyx; it is kept only so that existing
# cimports of this name keep compiling. Presumably a typo for X -- confirm and
# remove.
cpdef size_t N
cpdef size_t DET
cpdef size_t ADP
cpdef size_t ADJ
cpdef size_t ADV
cpdef size_t VERB
cpdef size_t NOUN
cpdef size_t PDT
cpdef size_t POS
cpdef size_t PRON
cpdef size_t PRT
# Declarations for the German tokenizer defined in de.pyx. The class and the
# module-level instance must carry the names the .pyx defines (German / DE),
# not the English ones this file was copied from.
# NOTE(review): the base is spelled spacy.Language, but this file only has
# "from spacy.spacy cimport Language" -- confirm that "spacy" resolves here.
cdef class German(spacy.Language):
    cdef int find_split(self, unicode word)


cdef German DE

# Lexeme is the only word type cimported by this file; "Word" was never
# declared here.
# NOTE(review): confirm Lexeme is what Language.lookup returns.
cpdef Lexeme lookup(unicode word)
cpdef list tokenize(unicode string)

126
spacy/de.pyx Normal file
View File

@ -0,0 +1,126 @@
# cython: profile=True
# cython: embedsignature=True
'''Tokenize German text, using a scheme based on the Negra corpus.
Tokenization is generally similar to that of English text, and the same set of
orthographic flags is used.
An abbreviation list is used to handle common abbreviations. Hyphenated words
are not split, following the Treebank usage.
'''
from __future__ import unicode_literals

from libc.stdint cimport uint64_t

cimport spacy
from spacy.common cimport check_punct
from spacy.orth import is_alpha, is_digit, is_punct, is_space, is_lower, is_upper
from spacy.orth import is_ascii
from spacy.orth import canonicalize_case, get_string_shape, asciify, get_non_sparse
# Python-readable flag constants --- an enum cannot be read from Python.
# The numbers come from a running counter instead of being written by hand, so
# that inserting a new flag does not force renumbering every flag after it.
# The counter is called "__i" rather than "i" to keep "i" out of the module's
# global scope. Every line must therefore increment "__i", never "i".
cdef size_t __i = 0

# Orthographic flags
ALPHA = __i; __i += 1
DIGIT = __i; __i += 1
PUNCT = __i; __i += 1
SPACE = __i; __i += 1
LOWER = __i; __i += 1
UPPER = __i; __i += 1
TITLE = __i; __i += 1
ASCII = __i; __i += 1

# Case-distribution flags
OFT_LOWER = __i; __i += 1
OFT_UPPER = __i; __i += 1
OFT_TITLE = __i; __i += 1

# Tag flags
# NOTE(review): PUNCT was already assigned in the orthographic group; this
# second assignment overwrites it, so PUNCT ends up holding the tag value and
# the earlier slot is left without a name. Both assignments are kept so that no
# constant changes value -- decide whether the two meanings need distinct names.
PUNCT = __i; __i += 1
CONJ = __i; __i += 1
NUM = __i; __i += 1
X = __i; __i += 1
DET = __i; __i += 1
ADP = __i; __i += 1
ADJ = __i; __i += 1
ADV = __i; __i += 1
VERB = __i; __i += 1
NOUN = __i; __i += 1
PDT = __i; __i += 1
POS = __i; __i += 1
PRON = __i; __i += 1
PRT = __i; __i += 1
# Indices into the list of string views, numbered consecutively from zero.
# NR_STRING_VIEWS takes the final position, so it equals the number of views.
# Adding a view means adding a name and bumping the range; a mismatch between
# the two fails loudly at import time.
SIC, CANON_CASED, NON_SPARSE, SHAPE, NR_STRING_VIEWS = range(5)
# Leave the shared counter where the one-by-one numbering would have left it.
__i = NR_STRING_VIEWS
def get_string_views(unicode string, lexeme):
    """Build the list of string views for a word form.

    The result has NR_STRING_VIEWS entries, indexed by the SIC, CANON_CASED,
    NON_SPARSE and SHAPE constants.

    string: the word form as it appeared in the text.
    lexeme: passed through unchanged to canonicalize_case and get_non_sparse.

    Returns the list of views.
    """
    views = [''] * NR_STRING_VIEWS
    views[SIC] = string
    views[CANON_CASED] = canonicalize_case(string, lexeme)
    views[SHAPE] = get_string_shape(string)
    # There is no view slot for the ASCII form, so it is only an intermediate
    # input to the non-sparse view.
    # NOTE(review): assumes asciify takes the raw string -- confirm against
    # spacy.orth.
    asciified = asciify(string)
    views[NON_SPARSE] = get_non_sparse(string, asciified, views[CANON_CASED],
                                       views[SHAPE], lexeme)
    return views
def set_orth_flags(unicode string, uint64_t flags):
    """Set the orthographic flag bits that apply to a string.

    Each (bit, predicate) pair below is tried in turn; when the predicate
    returns a true value for the string, that bit is switched on in flags.
    Bits that are already set are never cleared.

    string: the word form to test.
    flags: the starting bit field.

    Returns the updated bit field.
    """
    # NOTE(review): no predicate is wired up for TITLE. PUNCT is assigned twice
    # at module level, so the bit used here is the later of the two values.
    setters = (
        (ALPHA, is_alpha),
        (DIGIT, is_digit),
        (PUNCT, is_punct),
        (SPACE, is_space),
        (LOWER, is_lower),
        (UPPER, is_upper),
        (ASCII, is_ascii),
    )
    for bit, setter in setters:
        if setter(string):
            flags |= 1 << bit
    return flags
cdef class German(spacy.Language):
    """Language subclass holding the German-specific tokenization rules."""

    cdef Lexeme new_lexeme(self, unicode string, cluster=0, case_stats=None,
                           tag_freqs=None):
        # NOTE(review): s, length, views and prob are not bound anywhere in this
        # method, and case_stats / tag_freqs are accepted but unused. The call
        # is kept as written, with only its closing parenthesis restored; the
        # missing values still need to be supplied before this can compile.
        return Lexeme(s, length, views, prob=prob, cluster=cluster,
                      flags=self.get_flags(string))

    cdef int find_split(self, unicode word):
        # Returns an offset into word, chosen by the first rule that matches:
        #   * word starts with 's or 'S               -> 2
        #   * word ends with 's and has 3+ characters -> length - 2
        #   * check_punct is true at position 0       -> 1
        #   * otherwise the first position where check_punct is true, or the
        #     full length if there is none (0 for an empty word).
        # Presumably the caller splits word at this offset -- confirm in
        # Language.
        cdef size_t length = len(word)
        cdef int i = 0
        if word.startswith("'s") or word.startswith("'S"):
            return 2
        # Contractions
        if word.endswith("'s") and length >= 3:
            return length - 2
        # Leading punctuation
        if check_punct(word, 0, length):
            return 1
        elif length >= 1:
            # Split off all trailing punctuation characters
            while i < length and not check_punct(word, i, length):
                i += 1
        return i
# Module-level German instance. Its bound methods are re-exported below so they
# can be called as plain functions of this module.
DE = German('de')

lookup, tokenize = DE.lookup, DE.tokenize
load_clusters, load_unigram_probs = DE.load_clusters, DE.load_unigram_probs
load_case_stats, load_tag_stats = DE.load_case_stats, DE.load_tag_stats