* Add German tokenizer files

2025-10-19 10:14:24 +03:00 · 2014-09-25 18:29:13 +02:00 · 2014-09-25 18:29:13 +02:00 · 93505276ed
commit 93505276ed
parent 2e44fa7179
2 changed files with 168 additions and 0 deletions
--- a/spacy/de.pxd
+++ b/spacy/de.pxd
@ -0,0 +1,42 @@
 from spacy.spacy cimport Language
 from spacy.word cimport Lexeme
 cimport cython
 # C-level declarations for the flag constants that de.pyx assigns at import
 # time. Each name may be declared only once here, so PUNCT appears a single
 # time even though de.pyx binds it twice.
 cpdef size_t ALPHA
 cpdef size_t DIGIT
 cpdef size_t PUNCT
 cpdef size_t SPACE
 cpdef size_t LOWER
 cpdef size_t UPPER
 cpdef size_t TITLE
 cpdef size_t ASCII
 cpdef size_t OFT_LOWER
 cpdef size_t OFT_TITLE
 cpdef size_t OFT_UPPER
 cpdef size_t CONJ
 cpdef size_t NUM
 # NOTE(review): de.pyx binds `X` at this position and never assigns `N`;
 # confirm which name is intended before relying on this declaration.
 cpdef size_t N
 cpdef size_t DET
 cpdef size_t ADP
 cpdef size_t ADJ
 cpdef size_t ADV
 cpdef size_t VERB
 cpdef size_t NOUN
 cpdef size_t PDT
 cpdef size_t POS
 cpdef size_t PRON
 cpdef size_t PRT
 # These declarations mirror de.pyx, which defines the extension type `German`
 # and the module-level instance `DE`. The base class is referenced as
 # `Language` because that is the name cimported from spacy.spacy in this file.
 cdef class German(Language):
    cdef int find_split(self, unicode word)
 cdef German DE
 # NOTE(review): the return type was `Word`, which is not cimported anywhere in
 # this file; `Lexeme` is the type cimported from spacy.word -- confirm.
 cpdef Lexeme lookup(unicode word)
 cpdef list tokenize(unicode string)
--- a/spacy/de.pyx
+++ b/spacy/de.pyx
@ -0,0 +1,126 @@
 # cython: profile=True
 # cython: embedsignature=True
 '''Tokenize German text, using a scheme based on the Negra corpus.
 Tokenization is generally similar to English text, and the same set of orthographic
 flags is used.
 An abbreviation list is used to handle common abbreviations. Hyphenated words
 are not split, following the Treebank usage.
 '''
 from __future__ import unicode_literals
 from libc.stdint cimport uint64_t
 cimport spacy
 from spacy.orth import is_alpha, is_digit, is_punct, is_space, is_lower, is_ascii
 from spacy.orth import canonicalize_case, get_string_shape, asciify, get_non_sparse
 from spacy.common cimport check_punct
 # Python-readable flag constants --- can't read an enum from Python
 # Don't want to manually assign these numbers, or we'll insert one and have to
 # change them all.
 # Don't use "i", as we don't want it in the global scope!
 # Every line below takes the current counter value and then advances the same
 # counter (`__i`), so the constants are consecutive integers starting at 0.
 cdef size_t __i = 0
 ALPHA = __i; __i += 1
 DIGIT = __i; __i += 1
 PUNCT = __i; __i += 1
 SPACE = __i; __i += 1
 LOWER = __i; __i += 1
 UPPER = __i; __i += 1
 TITLE = __i; __i += 1
 ASCII = __i; __i += 1
 OFT_LOWER = __i; __i += 1
 OFT_UPPER = __i; __i += 1
 OFT_TITLE = __i; __i += 1
 # NOTE(review): PUNCT is bound a second time here, so after import the name
 # holds this later value and the earlier binding above is lost. Confirm whether
 # the two uses were meant to share one name.
 PUNCT = __i; __i += 1
 CONJ = __i; __i += 1
 NUM = __i; __i += 1
 # NOTE(review): de.pxd declares `N` at this position, not `X` -- confirm.
 X = __i; __i += 1
 DET = __i; __i += 1
 ADP = __i; __i += 1
 ADJ = __i; __i += 1
 ADV = __i; __i += 1
 VERB = __i; __i += 1
 NOUN = __i; __i += 1
 PDT = __i; __i += 1
 POS = __i; __i += 1
 PRON = __i; __i += 1
 PRT = __i; __i += 1
 # These are for the string views
 # The counter restarts at 0: these are indices into the list built by
 # get_string_views, and NR_STRING_VIEWS is the length of that list.
 __i = 0
 SIC = __i; __i += 1
 CANON_CASED = __i; __i += 1
 NON_SPARSE = __i; __i += 1
 SHAPE = __i; __i += 1
 NR_STRING_VIEWS = __i
 def get_string_views(unicode string, lexeme):
    '''Build the list of string views for `string`.

    The result has NR_STRING_VIEWS entries, indexed by the module-level
    constants SIC, CANON_CASED, SHAPE and NON_SPARSE. `lexeme` is passed
    through to canonicalize_case and get_non_sparse.
    '''
    views = ['' for _ in range(NR_STRING_VIEWS)]
    views[SIC] = string
    views[CANON_CASED] = canonicalize_case(string, lexeme)
    views[SHAPE] = get_string_shape(string)
    # No view index is defined for the asciified form, so it is kept in a
    # local and only used as an input to the non-sparse view.
    # NOTE(review): assumes asciify accepts the raw string as its only
    # argument -- confirm against spacy.orth.
    asciified = asciify(string)
    views[NON_SPARSE] = get_non_sparse(string, asciified, views[CANON_CASED],
                                       views[SHAPE], lexeme)
    return views
 def set_orth_flags(unicode string, flags_t flags):
    '''Return `flags` with one extra bit set for each predicate that holds.

    Each (bit, predicate) pair in the table is tried in turn; when the
    predicate returns a true value for `string`, bit number `bit` is OR-ed
    into `flags`. Bits already set in `flags` are preserved.
    '''
    setters = [
        (ALPHA, is_alpha),
        (DIGIT, is_digit),
        (PUNCT, is_punct),
        (SPACE, is_space),
        (LOWER, is_lower),
        # is_upper is not among the names this module imports from
        # spacy.orth, so the built-in unicode method is used instead.
        (UPPER, unicode.isupper),
        # NOTE(review): this slot previously repeated (SPACE, is_space);
        # is_ascii is imported and ASCII is defined but neither was used
        # anywhere, so this is presumably the intended entry -- confirm.
        (ASCII, is_ascii),
    ]
    for bit, setter in setters:
        if setter(string):
            flags |= 1 << bit
    return flags
 cdef class German(spacy.Language):
    '''Language subclass providing lexeme construction and the word-splitting
    rule used for German text.'''
    cdef Lexeme new_lexeme(self, unicode string, cluster=0, case_stats=None,
                           tag_freqs=None):
        '''Construct the Lexeme for `string`.

        `cluster` is forwarded to the Lexeme constructor. `case_stats` and
        `tag_freqs` are accepted but not used in this body.
        '''
        # NOTE(review): `views` and `prob` are not bound in this method or in
        # the visible module scope, and `get_flags` is not defined in this
        # file (presumably inherited from Language). These need to be
        # supplied before this method can work -- confirm the intended source.
        return Lexeme(string, len(string), views, prob=prob, cluster=cluster,
                      flags=self.get_flags(string))
    cdef int find_split(self, unicode word):
        '''Return the index at which `word` should be split.

        Rules, in order:
          * a leading "'s" or "'S" gives 2;
          * a trailing "'s" on a word of length >= 3 gives length - 2;
          * if check_punct(word, 0, length) is true, gives 1;
          * otherwise gives the first index i for which
            check_punct(word, i, length) is true, or the full length when
            there is none.
        '''
        cdef size_t length = len(word)
        cdef int i = 0
        if word.startswith(("'s", "'S")):
            return 2
        # Contractions
        if word.endswith("'s") and length >= 3:
            return length - 2
        # Leading punctuation
        if check_punct(word, 0, length):
            return 1
        elif length >= 1:
            # Advance to the first position that check_punct accepts; that
            # position is where the remaining characters get split off.
            i = 0
            while i < length and not check_punct(word, i, length):
                i += 1
        return i
 # Module-level German instance. The names below bind its methods at module
 # scope so they can be called directly on this module.
 DE = German('de')
 # Data loaders
 load_case_stats = DE.load_case_stats
 load_clusters = DE.load_clusters
 load_tag_stats = DE.load_tag_stats
 load_unigram_probs = DE.load_unigram_probs
 # Lookup and tokenization entry points
 lookup = DE.lookup
 tokenize = DE.tokenize