mirror of https://github.com/explosion/spaCy.git (synced 2024-12-25 17:36:30 +03:00)
* Add German tokenizer files

parent 2e44fa7179
commit 93505276ed
spacy/de.pxd  (Normal file, 42 lines)
@@ -0,0 +1,42 @@
from spacy.spacy cimport Language
from spacy.word cimport Lexeme
cimport cython


cpdef size_t ALPHA
cpdef size_t DIGIT
cpdef size_t PUNCT
cpdef size_t SPACE
cpdef size_t LOWER
cpdef size_t UPPER
cpdef size_t TITLE
cpdef size_t ASCII

cpdef size_t OFT_LOWER
cpdef size_t OFT_TITLE
cpdef size_t OFT_UPPER

cpdef size_t CONJ
cpdef size_t NUM
cpdef size_t X
cpdef size_t DET
cpdef size_t ADP
cpdef size_t ADJ
cpdef size_t ADV
cpdef size_t VERB
cpdef size_t NOUN
cpdef size_t PDT
cpdef size_t POS
cpdef size_t PRON
cpdef size_t PRT


cdef class German(spacy.Language):
    cdef int find_split(self, unicode word)


cdef German DE


cpdef Lexeme lookup(unicode word)
cpdef list tokenize(unicode string)
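The cpdef constants above are bit positions rather than enum members, so they stay readable from Python; de.pyx sets a lexeme's orthographic flags with flags |= 1 << bit. A minimal Python sketch of that convention follows; the concrete index values and the has_flag helper are assumptions made for the illustration, not part of the module:

# Illustrative sketch of the bit-flag convention assumed by de.pxd / de.pyx.
# The index values and the has_flag helper below are made up for this example.
ALPHA, DIGIT, PUNCT, SPACE = 0, 1, 2, 3

def has_flag(flags, bit):
    # set_orth_flags packs properties with `flags |= 1 << bit`; reading a flag
    # back shifts the packed word right by the bit index and masks the low bit.
    return bool((flags >> bit) & 1)

flags = 0
flags |= 1 << ALPHA            # mark the token as alphabetic
assert has_flag(flags, ALPHA)
assert not has_flag(flags, DIGIT)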
spacy/de.pyx  (Normal file, 126 lines)
@@ -0,0 +1,126 @@
# cython: profile=True
# cython: embedsignature=True
'''Tokenize German text, using a scheme based on the Negra corpus.

Tokenization is generally similar to English text, and the same set of
orthographic flags is used.

An abbreviation list is used to handle common abbreviations. Hyphenated words
are not split, following Treebank usage.
'''
from __future__ import unicode_literals

from libc.stdint cimport uint64_t

cimport spacy

from spacy.orth import is_alpha, is_digit, is_punct, is_space, is_lower, is_upper, is_ascii
from spacy.orth import canonicalize_case, get_string_shape, asciify, get_non_sparse
from spacy.common cimport check_punct

# Python-readable flag constants --- can't read an enum from Python

# Don't want to manually assign these numbers, or we'll insert one and have to
# change them all.
# Don't use "i", as we don't want it in the global scope!
cdef size_t __i = 0

ALPHA = __i; __i += 1
DIGIT = __i; __i += 1
PUNCT = __i; __i += 1
SPACE = __i; __i += 1
LOWER = __i; __i += 1
UPPER = __i; __i += 1
TITLE = __i; __i += 1
ASCII = __i; __i += 1

OFT_LOWER = __i; __i += 1
OFT_UPPER = __i; __i += 1
OFT_TITLE = __i; __i += 1

CONJ = __i; __i += 1
NUM = __i; __i += 1
X = __i; __i += 1
DET = __i; __i += 1
ADP = __i; __i += 1
ADJ = __i; __i += 1
ADV = __i; __i += 1
VERB = __i; __i += 1
NOUN = __i; __i += 1
PDT = __i; __i += 1
POS = __i; __i += 1
PRON = __i; __i += 1
PRT = __i; __i += 1


# These are for the string views
__i = 0
SIC = __i; __i += 1
CANON_CASED = __i; __i += 1
NON_SPARSE = __i; __i += 1
SHAPE = __i; __i += 1
NR_STRING_VIEWS = __i


def get_string_views(unicode string, lexeme):
    views = ['' for _ in range(NR_STRING_VIEWS)]
    views[SIC] = string
    views[CANON_CASED] = canonicalize_case(string, lexeme)
    views[SHAPE] = get_string_shape(string)
    views[NON_SPARSE] = get_non_sparse(string, asciify(string), views[CANON_CASED],
                                       views[SHAPE], lexeme)
    return views


def set_orth_flags(unicode string, uint64_t flags):
    setters = [
        (ALPHA, is_alpha),
        (DIGIT, is_digit),
        (PUNCT, is_punct),
        (SPACE, is_space),
        (LOWER, is_lower),
        (UPPER, is_upper),
        (ASCII, is_ascii)
    ]

    for bit, setter in setters:
        if setter(string):
            flags |= 1 << bit
    return flags


cdef class German(spacy.Language):
    cdef Lexeme new_lexeme(self, unicode string, prob=0.0, cluster=0, case_stats=None,
                           tag_freqs=None):
        views = get_string_views(string, case_stats)
        return Lexeme(string, len(string), views, prob=prob, cluster=cluster,
                      flags=set_orth_flags(string, 0))

    cdef int find_split(self, unicode word):
        cdef size_t length = len(word)
        cdef int i = 0
        if word.startswith("'s") or word.startswith("'S"):
            return 2
        # Contractions
        if word.endswith("'s") and length >= 3:
            return length - 2
        # Leading punctuation
        if check_punct(word, 0, length):
            return 1
        elif length >= 1:
            # Split off all trailing punctuation characters
            i = 0
            while i < length and not check_punct(word, i, length):
                i += 1
        return i


DE = German('de')

lookup = DE.lookup
tokenize = DE.tokenize
load_clusters = DE.load_clusters
load_unigram_probs = DE.load_unigram_probs
load_case_stats = DE.load_case_stats
load_tag_stats = DE.load_tag_stats
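The module-level bindings at the bottom of de.pyx expose the tokenizer as plain functions. A short usage sketch, assuming the Cython extension is built; the sample sentence, the printed form of each token, and the exact behaviour of lookup are illustrative assumptions rather than verified output:

# Hypothetical usage of the module-level API bound at the bottom of spacy/de.pyx.
# Assumes the extension compiles; how tokens print is an assumption here.
from __future__ import unicode_literals
from spacy import de

# find_split peels off leading punctuation, trailing punctuation and 's
# contractions, so "Haus." should come back as the two pieces "Haus" and ".".
tokens = de.tokenize("Das ist Herr Müllers Haus.")
for token in tokens:
    print(token)

word = de.lookup("Haus")     # fetch (or create) the lexeme for a single string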