mirror of
https://github.com/explosion/spaCy.git
* Add German tokenizer files
parent 2e44fa7179
commit 93505276ed
42 spacy/de.pxd Normal file
@@ -0,0 +1,42 @@
from spacy.spacy cimport Language
from spacy.word cimport Lexeme

cimport cython


# Orthographic flag constants (bit positions).
cpdef size_t ALPHA
cpdef size_t DIGIT
cpdef size_t PUNCT
cpdef size_t SPACE
cpdef size_t LOWER
cpdef size_t UPPER
cpdef size_t TITLE
cpdef size_t ASCII

cpdef size_t OFT_LOWER
cpdef size_t OFT_TITLE
cpdef size_t OFT_UPPER

# Coarse POS-tag constants. PUNCT is already declared above and is reused
# for the punctuation tag, so it is not redeclared here (a second cpdef of
# the same name would not compile).
cpdef size_t CONJ
cpdef size_t NUM
cpdef size_t X
cpdef size_t DET
cpdef size_t ADP
cpdef size_t ADJ
cpdef size_t ADV
cpdef size_t VERB
cpdef size_t NOUN
cpdef size_t PDT
cpdef size_t POS
cpdef size_t PRON
cpdef size_t PRT


cdef class German(Language):
    cdef int find_split(self, unicode word)


cdef German DE


cpdef Lexeme lookup(unicode word)
cpdef list tokenize(unicode string)
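Because the module-level constants are cpdef'd rather than plain cdef, they are readable from Python as well as Cython. A minimal sketch, assuming the extension compiles and imports as spacy.de:

from spacy import de

print(de.ALPHA, de.DIGIT)   # small integers, used as bit positions in de.pyx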
126 spacy/de.pyx Normal file
@@ -0,0 +1,126 @@
# cython: profile=True
# cython: embedsignature=True
'''Tokenize German text, using a scheme based on the Negra corpus.

Tokenization is generally similar to English text, and the same set of
orthographic flags is used.

An abbreviation list is used to handle common abbreviations. Hyphenated words
are not split, following the Treebank usage.
'''
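As a rough sketch of the scheme the docstring describes, the intended behaviour looks like this. Illustrative only: the abbreviation list is not part of this commit, and tokenize returns Lexeme objects rather than strings (see de.pxd), so the boundaries are shown as comments:

from spacy.de import tokenize

# Intended boundaries for a sample sentence (assumed, not a real run):
#   u'Das E-Mail-Programm kommt z.B. aus Berlin.'
#   -> Das | E-Mail-Programm | kommt | z.B. | aus | Berlin | .
# The abbreviation stays whole, the hyphenated compound is not split,
# and trailing punctuation becomes its own token.
lexemes = tokenize(u'Das E-Mail-Programm kommt z.B. aus Berlin.')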
from __future__ import unicode_literals

from libc.stdint cimport uint64_t

cimport spacy

from spacy.orth import is_alpha, is_digit, is_punct, is_space, is_lower, is_upper, is_ascii
from spacy.orth import canonicalize_case, get_string_shape, asciify, get_non_sparse
from spacy.common cimport check_punct


# Python-readable flag constants --- can't read an enum from Python

# Don't want to manually assign these numbers, or we'll insert one and have to
# change them all.
# Don't use "i", as we don't want it in the global scope!
cdef size_t __i = 0

ALPHA = __i; __i += 1
DIGIT = __i; __i += 1
PUNCT = __i; __i += 1
SPACE = __i; __i += 1
LOWER = __i; __i += 1
UPPER = __i; __i += 1
TITLE = __i; __i += 1
ASCII = __i; __i += 1

OFT_LOWER = __i; __i += 1
OFT_UPPER = __i; __i += 1
OFT_TITLE = __i; __i += 1

# Coarse POS tags. PUNCT is not reassigned here: it already has a value from
# the flag block above, and rebinding it would silently change that flag.
CONJ = __i; __i += 1
NUM = __i; __i += 1
X = __i; __i += 1
DET = __i; __i += 1
ADP = __i; __i += 1
ADJ = __i; __i += 1
ADV = __i; __i += 1
VERB = __i; __i += 1
NOUN = __i; __i += 1
PDT = __i; __i += 1
POS = __i; __i += 1
PRON = __i; __i += 1
PRT = __i; __i += 1


# These are for the string views. ASCIIFIED gets a slot so that
# get_string_views below has somewhere to store the asciify() output.
__i = 0
SIC = __i; __i += 1
CANON_CASED = __i; __i += 1
ASCIIFIED = __i; __i += 1
NON_SPARSE = __i; __i += 1
SHAPE = __i; __i += 1
NR_STRING_VIEWS = __i


def get_string_views(unicode string, lexeme):
    views = ['' for _ in range(NR_STRING_VIEWS)]
    views[SIC] = string
    views[CANON_CASED] = canonicalize_case(string, lexeme)
    views[SHAPE] = get_string_shape(string)
    views[ASCIIFIED] = asciify(string)
    views[NON_SPARSE] = get_non_sparse(string, views[ASCIIFIED], views[CANON_CASED],
                                       views[SHAPE], lexeme)
    return views
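A sketch of what the view array holds for one token. The exact strings depend on the helpers in spacy.orth, which this commit does not include, so the values shown are assumptions:

views = get_string_views(u'Vierzig', lexeme)   # lexeme: a previously built entry
# views[SIC]         -> u'Vierzig'   the literal string
# views[CANON_CASED] -> u'vierzig'   canonical casing (assumed)
# views[ASCIIFIED]   -> u'Vierzig'   ASCII-folded form (already ASCII here)
# views[NON_SPARSE]  -> a back-off view for rare words
# views[SHAPE]       -> u'Xxxxxxx'   character-shape view (assumed format)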


def set_orth_flags(unicode string, uint64_t flags):
    setters = [
        (ALPHA, is_alpha),
        (DIGIT, is_digit),
        (PUNCT, is_punct),
        (SPACE, is_space),
        (LOWER, is_lower),
        (UPPER, is_upper),
        (ASCII, is_ascii)
    ]
    for bit, setter in setters:
        if setter(string):
            flags |= 1 << bit
    return flags
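Because each constant is a bit position, a flag is read back with a shift and mask. has_flag below is a hypothetical helper, not part of the commit:

def has_flag(flags, bit):
    # Test one orthographic flag bit.
    return bool(flags & (1 << bit))

flags = set_orth_flags(u'Berlin', 0)
# has_flag(flags, ALPHA) -> True    all characters alphabetic (assumed semantics)
# has_flag(flags, UPPER) -> False   not an all-upper-case string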


cdef class German(spacy.Language):
    cdef Lexeme new_lexeme(self, unicode string, cluster=0, case_stats=None,
                           tag_freqs=None):
        # No existing lexeme to draw case statistics from yet, hence None;
        # prob defaults to 0 until unigram probabilities are loaded, and the
        # flags are computed with set_orth_flags above.
        views = get_string_views(string, None)
        return Lexeme(string, len(string), views, prob=0.0, cluster=cluster,
                      flags=set_orth_flags(string, 0))

    cdef int find_split(self, unicode word):
        cdef size_t length = len(word)
        cdef int i = 0
        if word.startswith("'s") or word.startswith("'S"):
            return 2
        # Contractions
        if word.endswith("'s") and length >= 3:
            return length - 2
        # Leading punctuation
        if check_punct(word, 0, length):
            return 1
        elif length >= 1:
            # Split off all trailing punctuation characters
            i = 0
            while i < length and not check_punct(word, i, length):
                i += 1
        return i
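To make the control flow concrete, here are the split points find_split yields for a few inputs, worked by hand from the code above (check_punct is assumed to test the character at the given index):

# find_split(u"'sind")   -> 2   leading clitic u"'s" split off
# find_split(u"Peter's") -> 5   trailing u"'s" split off
# find_split(u"(Berlin") -> 1   leading punctuation split off
# find_split(u"Berlin.") -> 6   split before the trailing full stop
# find_split(u"Berlin")  -> 6   == len(word): no split needed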


DE = German('de')

lookup = DE.lookup
tokenize = DE.tokenize
load_clusters = DE.load_clusters
load_unigram_probs = DE.load_unigram_probs
load_case_stats = DE.load_case_stats
load_tag_stats = DE.load_tag_stats
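The module-level aliases expose a functional interface over the DE instance. A sketch of typical use; the data paths and formats are assumptions, and the load_* methods come from the shared Language base class rather than this file:

from spacy import de

# de.load_clusters('data/de/clusters')            # path/format assumed
# de.load_unigram_probs('data/de/unigram_probs')  # path/format assumed

lexeme = de.lookup(u'Vierzig')        # a single Lexeme (see de.pxd)
lexemes = de.tokenize(u'Guten Tag!')  # a list of Lexeme objects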