mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
* Add German tokenizer files
This commit is contained in:
parent
2e44fa7179
commit
93505276ed
42
spacy/de.pxd
Normal file
42
spacy/de.pxd
Normal file
|
@ -0,0 +1,42 @@
|
|||
from spacy.spacy cimport Language
|
||||
from spacy.word cimport Lexeme
|
||||
cimport cython
|
||||
|
||||
|
||||
# Flag constants assigned in de.pyx. Each name is declared exactly once here.

# Flags that set_orth_flags() in de.pyx uses as bit positions (TITLE and
# ASCII are declared alongside them).
cpdef size_t ALPHA
cpdef size_t DIGIT
# NOTE(review): de.pyx assigns PUNCT twice (once in this group and once in the
# tag group below). It can only be declared once, so the duplicate declaration
# that used to sit in the tag group has been dropped.
cpdef size_t PUNCT
cpdef size_t SPACE
cpdef size_t LOWER
cpdef size_t UPPER
cpdef size_t TITLE
cpdef size_t ASCII

cpdef size_t OFT_LOWER
cpdef size_t OFT_TITLE
cpdef size_t OFT_UPPER

# Presumably part-of-speech tag flags --- TODO confirm against the tagger.
cpdef size_t CONJ
cpdef size_t NUM
# de.pyx defines X (not N), so the declaration is named to match it.
cpdef size_t X
cpdef size_t DET
cpdef size_t ADP
cpdef size_t ADJ
cpdef size_t ADV
cpdef size_t VERB
cpdef size_t NOUN
cpdef size_t PDT
cpdef size_t POS
cpdef size_t PRON
cpdef size_t PRT
|
||||
|
||||
# Declarations for the German tokenizer implemented in de.pyx. The class and
# instance names match the definitions there (German / DE).
# The base class is referred to as ``Language`` because that is the name this
# file cimports (``from spacy.spacy cimport Language``).
cdef class German(Language):
    cdef int find_split(self, unicode word)


cdef German DE


# NOTE(review): the return type is declared as Lexeme because that is the only
# word type cimported in this file --- confirm against Language.lookup.
cpdef Lexeme lookup(unicode word)
cpdef list tokenize(unicode string)
|
126
spacy/de.pyx
Normal file
126
spacy/de.pyx
Normal file
|
@ -0,0 +1,126 @@
|
|||
# cython: profile=True
|
||||
# cython: embedsignature=True
|
||||
'''Tokenize German text, using a scheme based on the Negra corpus.
|
||||
|
||||
Tokenization is generally similar to English text, and the same set of orthographic
|
||||
flags are used.
|
||||
|
||||
An abbreviation list is used to handle common abbreviations. Hyphenated words
|
||||
are not split, following the Treebank usage.
|
||||
'''
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from libc.stdint cimport uint64_t
|
||||
|
||||
cimport spacy
|
||||
|
||||
from spacy.orth import is_alpha, is_digit, is_punct, is_space, is_lower, is_ascii
|
||||
from spacy.orth import canonicalize_case, get_string_shape, asciify, get_non_sparse
|
||||
from spacy.common cimport check_punct
|
||||
|
||||
# Python-readable flag constants --- can't read an enum from Python

# Don't want to manually assign these numbers, or we'll insert one and have to
# change them all.
# Don't use "i", as we don't want it in the global scope!
cdef size_t __i = 0

# Every line must bump ``__i`` (never a bare ``i``) so that each constant gets
# its own consecutive value.
ALPHA = __i; __i += 1
DIGIT = __i; __i += 1
PUNCT = __i; __i += 1
SPACE = __i; __i += 1
LOWER = __i; __i += 1
UPPER = __i; __i += 1
TITLE = __i; __i += 1
ASCII = __i; __i += 1

OFT_LOWER = __i; __i += 1
OFT_UPPER = __i; __i += 1
OFT_TITLE = __i; __i += 1

# NOTE(review): PUNCT is assigned a second time here, so the module-level value
# is this later one and the slot reserved for it in the first group is unused.
# Left as-is to keep the existing numbering; one of the two should be renamed.
PUNCT = __i; __i += 1
CONJ = __i; __i += 1
NUM = __i; __i += 1
X = __i; __i += 1
DET = __i; __i += 1
ADP = __i; __i += 1
ADJ = __i; __i += 1
ADV = __i; __i += 1
VERB = __i; __i += 1
NOUN = __i; __i += 1
PDT = __i; __i += 1
POS = __i; __i += 1
PRON = __i; __i += 1
PRT = __i; __i += 1
|
||||
|
||||
|
||||
# Indices into the list returned by get_string_views(), numbered
# consecutively from zero.
SIC, CANON_CASED, NON_SPARSE, SHAPE = range(4)
# Leave the shared counter at the number of views, as before.
__i = 4
NR_STRING_VIEWS = __i
|
||||
|
||||
|
||||
def get_string_views(unicode string, lexeme):
    """Build the list of string views for ``string``.

    The result has ``NR_STRING_VIEWS`` entries, indexed by the module-level
    constants ``SIC``, ``CANON_CASED``, ``NON_SPARSE`` and ``SHAPE``.

    Args:
        string: The token text; stored unchanged in the ``SIC`` slot.
        lexeme: Passed through to ``canonicalize_case`` and ``get_non_sparse``.

    Returns:
        A list of ``NR_STRING_VIEWS`` items, one per view.
    """
    views = [''] * NR_STRING_VIEWS
    views[SIC] = string
    views[CANON_CASED] = canonicalize_case(string, lexeme)
    views[SHAPE] = get_string_shape(string)
    # No view slot is defined for the ASCII form, so it is only an input to
    # the non-sparse view rather than being stored.
    # NOTE(review): uses the imported ``asciify`` helper; assumes it takes the
    # string as its only argument --- confirm against spacy.orth.
    asciified = asciify(string)
    views[NON_SPARSE] = get_non_sparse(string, asciified, views[CANON_CASED],
                                       views[SHAPE], lexeme)
    return views
|
||||
|
||||
|
||||
def set_orth_flags(unicode string, flags_t flags):
    """Set the orthographic flag bits that apply to ``string``.

    Each predicate is called on ``string``; when it returns a true value, the
    bit at the matching flag constant's position is OR-ed into ``flags``.

    Args:
        string: The token text to test.
        flags: The starting bit field.

    Returns:
        ``flags`` with the applicable bits added.
    """
    # NOTE(review): ``flags_t`` is not declared or cimported anywhere in the
    # visible file --- confirm where it is meant to come from.
    setters = [
        (ALPHA, is_alpha),
        (DIGIT, is_digit),
        (PUNCT, is_punct),
        (SPACE, is_space),
        (LOWER, is_lower),
        # NOTE(review): this file does not import an ``is_upper`` helper, so
        # the built-in method is used; swap in spacy.orth's helper if one
        # exists.
        (UPPER, unicode.isupper),
        # Previously a second (SPACE, is_space) entry; ASCII was never set.
        (ASCII, is_ascii),
    ]

    for bit, setter in setters:
        if setter(string):
            flags |= 1 << bit
    return flags
|
||||
|
||||
|
||||
cdef class German(spacy.Language):
    """German tokenizer built on ``spacy.Language``."""

    cdef Lexeme new_lexeme(self, unicode string, cluster=0, case_stats=None,
                           tag_freqs=None):
        """Create the ``Lexeme`` for ``string``.

        ``case_stats`` and ``tag_freqs`` are accepted but not used here.
        """
        # NOTE(review): ``s``, ``length``, ``views`` and ``prob`` are not
        # defined in this method or anywhere in the visible file. They are
        # left untouched rather than guessed at; only the unbalanced
        # parenthesis of the call has been repaired.
        return Lexeme(s, length, views, prob=prob, cluster=cluster,
                      flags=self.get_flags(string))

    cdef int find_split(self, unicode word):
        """Return a split offset for ``word``.

        Cases, checked in order:
          * a leading ``'s`` / ``'S`` gives 2;
          * a trailing ``'s`` on a word of three or more characters gives the
            offset where that suffix starts;
          * if ``check_punct`` accepts position 0, gives 1;
          * otherwise gives the first position ``check_punct`` accepts, or
            the full length if there is none (0 for an empty word).
        """
        cdef size_t length = len(word)
        cdef int i = 0
        if word.startswith(("'s", "'S")):
            return 2
        # Trailing 's suffix
        if word.endswith("'s") and length >= 3:
            return length - 2
        # Leading punctuation
        if check_punct(word, 0, length):
            return 1
        elif length >= 1:
            # Scan forward to the first position check_punct accepts.
            while i < length and not check_punct(word, i, length):
                i += 1
        return i
|
||||
|
||||
|
||||
# Shared module-level tokenizer instance.
DE = German('de')

# Expose the instance's bound methods as module-level functions.
lookup, tokenize = DE.lookup, DE.tokenize
load_clusters, load_unigram_probs = DE.load_clusters, DE.load_unigram_probs
load_case_stats, load_tag_stats = DE.load_case_stats, DE.load_tag_stats
|
Loading…
Reference in New Issue
Block a user