Reformat language data

This commit is contained in:
Ines Montani 2016-11-24 13:51:32 +01:00
parent 5ad5408242
commit e0712d1b32
2 changed files with 117 additions and 41 deletions

View File

@ -82,7 +82,7 @@ zurück zusammen zwanzig zwar zwei zweite zweiten zweiter zweites zwischen
""".split())
TOKENIZER_PREFIXES = map(re.escape, r'''
TOKENIZER_PREFIXES = r'''
,
"
(
@ -106,11 +106,11 @@ a-
....
...
»
_
§
'''.strip().split('\n'))
'''.strip().split('\n')
TOKENIZER_SUFFIXES = r'''
@ -141,6 +141,7 @@ _
°
\.\.
\.\.\.
\.\.\.\.
@ -191,7 +192,8 @@ _
TOKENIZER_INFIXES = r'''
\.\.\.
\.\.\.+
(?<=[a-z])\.(?=[A-Z])
(?<=[a-zöäüßA-ZÖÄÜ"]):(?=[a-zöäüßA-ZÖÄÜ])
(?<=[a-zöäüßA-ZÖÄÜ"])>(?=[a-zöäüßA-ZÖÄÜ])

View File

@ -1,39 +1,57 @@
# encoding: utf8
from __future__ import unicode_literals
import re
# English stop words; improved list from Stone, Denis, Kwantes (2010).
# The literal is split on whitespace, so the line breaks inside it are purely
# visual and repeated entries (e.g. "very", "rather") collapse in the set.
# NOTE(review): the literal holds two overlapping word lists back to back -- a
# wrapped block followed by a block roughly grouped by initial letter. The
# first block carries the misspellings "amoungst" and "fify"; confirm whether
# it is meant to remain.
STOP_WORDS = set("""
a about above across after afterwards again against all almost alone
along already also although always am among amongst amoungst amount
an and another any anyhow anyone anything anyway anywhere are around
as at back be became because become becomes becoming been before
beforehand behind being below beside besides between beyond bill
both bottom but by call can cannot cant co computer con could couldnt
cry de describe detail did didn do does doesn doing don done down due
during each eg eight either eleven else elsewhere empty enough etc
even ever every everyone everything everywhere except few fifteen
fify fill find fire first five for former formerly forty found four
from front full further get give go had has hasnt have he hence her
here hereafter hereby herein hereupon hers herself him himself his
how however hundred i ie if in inc indeed interest into is it its
itself keep last latter latterly least less ltd just kg km made make
many may me meanwhile might mill mine more moreover most mostly move
much must my myself name namely neither never nevertheless next nine
no nobody none noone nor not nothing now nowhere of off often on once
one only onto or other others otherwise our ours ourselves out over
own part per perhaps please put rather re quite rather really regarding
same say see seem seemed seeming seems serious several she should
show side since sincere six sixty so some somehow someone something
sometime sometimes somewhere still such system take ten than that the
their them themselves then thence there thereafter thereby therefore
therein thereupon these they thick thin third this those though three
through throughout thru thus to together too top toward towards twelve
twenty two un under until up unless upon us used using various very
very via was we well were what whatever when whence whenever where whereafter
whereas whereby wherein whereupon wherever whether which while whither
who whoever whole whom whose why will with within without would yet you
your yours yourself yourselves
a about above across after afterwards again against all almost alone along already also although always am among amongst amount an and another any anyhow anyone anything anyway anywhere are around as at
back be became because become becomes becoming been before beforehand behind being below beside besides between beyond both bottom but by
call can cannot ca could
did do does doing done down due during
each eight either eleven else elsewhere empty enough etc even ever every everyone everything everywhere except
few fifteen fifty first five for former formerly forty four from front full further
get give go
had has have he hence her here hereafter hereby herein hereupon hers herself him himself his how however hundred
i if in inc indeed into is it its itself
keep
last latter latterly least less
just
made make many may me meanwhile might mine more moreover most mostly move much must my myself
name namely neither never nevertheless next nine no nobody none noone nor not nothing now nowhere
of off often on once one only onto or other others otherwise our ours ourselves out over own
part per perhaps please put
quite
rather re really regarding
same say see seem seemed seeming seems serious several she should show side since six sixty so some somehow someone something sometime sometimes somewhere still such
take ten than that the their them themselves then thence there thereafter thereby therefore therein thereupon these they third this those though three through throughout thru thus to together too top toward towards twelve twenty two
under until up unless upon us used using
various very very via was we well were what whatever when whence whenever where whereafter whereas whereby wherein whereupon wherever whether which while whither who whoever whole whom whose why will with within without would
yet you your yours yourself yourselves
""".split())
@ -98,17 +116,73 @@ TAG_MAP = {
"HVS": {"pos": "verb"}
}
TOKENIZER_PREFIXES = r''', " ( [ { * < $ £ “ ' `` ` # US$ C$ A$ a- .... ...'''.split()
# Prefix entries for the tokenizer, kept as plain (unescaped) strings.
# NOTE(review): several entries contain regex metacharacters ("(", "[", "*",
# "$", "."), so they presumably get escaped by whatever compiles them into a
# pattern -- confirm against the consumer of this list.
TOKENIZER_PREFIXES = [
    ',',
    '"',
    '(',
    '[',
    '{',
    '*',
    '<',
    '$',
    '£',
    "'",
    '``',
    '`',
    '#',
    # Currency markers.
    'US$',
    'C$',
    'A$',
    'a-',
    # Longer ellipsis is listed before the shorter one.
    '....',
    '...',
]
TOKENIZER_SUFFIXES = (r''', \" \) \] \} \* \! \? % \$ > : ; ''' 's 'S s S '''
r'''\.\. \.\.\. \.\.\.\. (?<=[a-z0-9)\]”"'%\)])\. '''
r'''(?<=[0-9])km''').strip().split()
# Suffix entries for the tokenizer. Unlike a plain string list, these are
# written as regular-expression fragments: metacharacters are backslash-escaped
# and the last two entries use lookbehind assertions.
TOKENIZER_SUFFIXES = [
    # Closing punctuation and brackets.
    ',',
    r'\"',
    r'\)',
    r'\]',
    r'\}',
    r'\*',
    r'\!',
    r'\?',
    '%',
    r'\$',
    '>',
    ':',
    ';',
    # Quotes and possessive markers.
    "'",
    "''",
    "'s",
    "'S",
    # NOTE(review): a bare "s"/"S" suffix looks like a possessive whose
    # typographic apostrophe was lost somewhere -- confirm the intended text.
    's',
    'S',
    # Runs of two to four dots.
    r'\.\.',
    r'\.\.\.',
    r'\.\.\.\.',
    # A full stop directly after a lowercase letter, digit, closing bracket,
    # quote or percent sign.
    r'''(?<=[a-z0-9)\]"'%\)])\.''',
    # The unit "km" directly after a digit.
    r'(?<=[0-9])km',
]
TOKENIZER_INFIXES = (r'''\.\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-z]) '''
r'''(?<=[a-zA-Z])--(?=[a-zA-z]) (?<=[0-9])-(?=[0-9]) '''
r'''(?<=[A-Za-z]),(?=[A-Za-z])''').split()
# Infix patterns for the tokenizer, one regular expression per line:
#   - three or more dots
#   - a full stop between a lowercase and an uppercase letter
#   - a hyphen or double hyphen between two ASCII letters
#   - a hyphen between two digits
#   - a comma between two ASCII letters
# The letter classes are spelled [a-zA-Z]; the range "A-z" must not be used
# here because it also covers the punctuation between "Z" and "a"
# ([ \ ] ^ _ `), which would make the hyphen rules fire before punctuation.
TOKENIZER_INFIXES = r'''
\.\.\.+
(?<=[a-z])\.(?=[A-Z])
(?<=[a-zA-Z])-(?=[a-zA-Z])
(?<=[a-zA-Z])--(?=[a-zA-Z])
(?<=[0-9])-(?=[0-9])
(?<=[A-Za-z]),(?=[A-Za-z])
'''.strip().split('\n')
TOKENIZER_EXCEPTIONS = {