mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	Update Portuguese language data
This commit is contained in:
		
							parent
							
								
									c0c5f31950
								
							
						
					
					
						commit
						7bfe2d4abc
					
				| 
						 | 
					@ -3,25 +3,37 @@ from __future__ import unicode_literals, print_function
 | 
				
			||||||
from os import path
 | 
					from os import path
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ..language import Language
 | 
					from ..language import Language
 | 
				
			||||||
from ..attrs import LANG
 | 
					 | 
				
			||||||
from . import language_data
 | 
					from . import language_data
 | 
				
			||||||
 | 
					from ..attrs import LANG
 | 
				
			||||||
 | 
					from ..util import update_exc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from ..language_data import EMOTICONS
 | 
				
			||||||
 | 
					from .language_data import ORTH_ONLY
 | 
				
			||||||
 | 
					from .language_data import strings_to_exc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					TOKENIZER_EXCEPTIONS = dict(language_data.TOKENIZER_EXCEPTIONS)
 | 
				
			||||||
 | 
					TOKENIZER_PREFIXES = tuple(language_data.TOKENIZER_PREFIXES)
 | 
				
			||||||
 | 
					TOKENIZER_SUFFIXES = tuple(language_data.TOKENIZER_SUFFIXES)
 | 
				
			||||||
 | 
					TOKENIZER_INFIXES = tuple(language_data.TOKENIZER_INFIXES)
 | 
				
			||||||
 | 
					TAG_MAP = dict(language_data.TAG_MAP)
 | 
				
			||||||
 | 
					STOP_WORDS = set(language_data.STOP_WORDS)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(EMOTICONS))
 | 
				
			||||||
 | 
					update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class Portuguese(Language):
 | 
					class Portuguese(Language):
 | 
				
			||||||
    lang = 'pt'
 | 
					    lang = 'pt'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    class Defaults(Language.Defaults):
 | 
					    class Defaults(Language.Defaults):
 | 
				
			||||||
        tokenizer_exceptions = dict(language_data.TOKENIZER_EXCEPTIONS)
 | 
					 | 
				
			||||||
        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
					        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
				
			||||||
        lex_attr_getters[LANG] = lambda text: 'pt'
 | 
					        lex_attr_getters[LANG] = lambda text: 'pt'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        prefixes = tuple(language_data.TOKENIZER_PREFIXES)
 | 
					        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
 | 
				
			||||||
        
 | 
					        prefixes = TOKENIZER_PREFIXES
 | 
				
			||||||
        suffixes = tuple(language_data.TOKENIZER_SUFFIXES)
 | 
					        suffixes = TOKENIZER_SUFFIXES
 | 
				
			||||||
        
 | 
					        infixes = TOKENIZER_INFIXES
 | 
				
			||||||
        infixes = tuple(language_data.TOKENIZER_INFIXES)
 | 
					        tag_map = TAG_MAP
 | 
				
			||||||
 | 
					        stop_words = STOP_WORDS
 | 
				
			||||||
        tag_map = dict(language_data.TAG_MAP)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        stop_words = set(language_data.STOP_WORDS)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,356 +1,33 @@
 | 
				
			||||||
# encoding: utf8
 | 
					# encoding: utf8
 | 
				
			||||||
from __future__ import unicode_literals
 | 
					from __future__ import unicode_literals
 | 
				
			||||||
import re
 | 
					
 | 
				
			||||||
 | 
					from ..symbols import *
 | 
				
			||||||
 | 
					from ..language_data import TOKENIZER_PREFIXES
 | 
				
			||||||
 | 
					from ..language_data import TOKENIZER_SUFFIXES
 | 
				
			||||||
 | 
					from ..language_data import TOKENIZER_INFIXES
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
STOP_WORDS = set()
 | 
					def strings_to_exc(orths):
 | 
				
			||||||
 | 
					    return {orth: [{ORTH: orth}] for orth in orths}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
TOKENIZER_PREFIXES = map(re.escape, r'''
 | 
					PRON_LEMMA = "-PRON-"
 | 
				
			||||||
,
 | 
					 | 
				
			||||||
"
 | 
					 | 
				
			||||||
(
 | 
					 | 
				
			||||||
[
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
*
 | 
					 | 
				
			||||||
<
 | 
					 | 
				
			||||||
>
 | 
					 | 
				
			||||||
$
 | 
					 | 
				
			||||||
£
 | 
					 | 
				
			||||||
„
 | 
					 | 
				
			||||||
“
 | 
					 | 
				
			||||||
'
 | 
					 | 
				
			||||||
``
 | 
					 | 
				
			||||||
`
 | 
					 | 
				
			||||||
#
 | 
					 | 
				
			||||||
US$
 | 
					 | 
				
			||||||
C$
 | 
					 | 
				
			||||||
A$
 | 
					 | 
				
			||||||
a-
 | 
					 | 
				
			||||||
‘
 | 
					 | 
				
			||||||
....
 | 
					 | 
				
			||||||
...
 | 
					 | 
				
			||||||
‚
 | 
					 | 
				
			||||||
»
 | 
					 | 
				
			||||||
_
 | 
					 | 
				
			||||||
§
 | 
					 | 
				
			||||||
'''.strip().split('\n'))
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
TOKENIZER_SUFFIXES = r'''
 | 
					 | 
				
			||||||
,
 | 
					 | 
				
			||||||
\"
 | 
					 | 
				
			||||||
\)
 | 
					 | 
				
			||||||
\]
 | 
					 | 
				
			||||||
\}
 | 
					 | 
				
			||||||
\*
 | 
					 | 
				
			||||||
\!
 | 
					 | 
				
			||||||
\?
 | 
					 | 
				
			||||||
%
 | 
					 | 
				
			||||||
\$
 | 
					 | 
				
			||||||
>
 | 
					 | 
				
			||||||
:
 | 
					 | 
				
			||||||
;
 | 
					 | 
				
			||||||
'
 | 
					 | 
				
			||||||
”
 | 
					 | 
				
			||||||
“
 | 
					 | 
				
			||||||
«
 | 
					 | 
				
			||||||
_
 | 
					 | 
				
			||||||
''
 | 
					 | 
				
			||||||
's
 | 
					 | 
				
			||||||
'S
 | 
					 | 
				
			||||||
’s
 | 
					 | 
				
			||||||
’S
 | 
					 | 
				
			||||||
’
 | 
					 | 
				
			||||||
‘
 | 
					 | 
				
			||||||
°
 | 
					 | 
				
			||||||
€
 | 
					 | 
				
			||||||
\.\.
 | 
					 | 
				
			||||||
\.\.\.
 | 
					 | 
				
			||||||
\.\.\.\.
 | 
					 | 
				
			||||||
(?<=[a-zäöüßÖÄÜ)\]"'´«‘’%\)²“”])\.
 | 
					 | 
				
			||||||
\-\-
 | 
					 | 
				
			||||||
´
 | 
					 | 
				
			||||||
(?<=[0-9])km²
 | 
					 | 
				
			||||||
(?<=[0-9])m²
 | 
					 | 
				
			||||||
(?<=[0-9])cm²
 | 
					 | 
				
			||||||
(?<=[0-9])mm²
 | 
					 | 
				
			||||||
(?<=[0-9])km³
 | 
					 | 
				
			||||||
(?<=[0-9])m³
 | 
					 | 
				
			||||||
(?<=[0-9])cm³
 | 
					 | 
				
			||||||
(?<=[0-9])mm³
 | 
					 | 
				
			||||||
(?<=[0-9])ha
 | 
					 | 
				
			||||||
(?<=[0-9])km
 | 
					 | 
				
			||||||
(?<=[0-9])m
 | 
					 | 
				
			||||||
(?<=[0-9])cm
 | 
					 | 
				
			||||||
(?<=[0-9])mm
 | 
					 | 
				
			||||||
(?<=[0-9])µm
 | 
					 | 
				
			||||||
(?<=[0-9])nm
 | 
					 | 
				
			||||||
(?<=[0-9])yd
 | 
					 | 
				
			||||||
(?<=[0-9])in
 | 
					 | 
				
			||||||
(?<=[0-9])ft
 | 
					 | 
				
			||||||
(?<=[0-9])kg
 | 
					 | 
				
			||||||
(?<=[0-9])g
 | 
					 | 
				
			||||||
(?<=[0-9])mg
 | 
					 | 
				
			||||||
(?<=[0-9])µg
 | 
					 | 
				
			||||||
(?<=[0-9])t
 | 
					 | 
				
			||||||
(?<=[0-9])lb
 | 
					 | 
				
			||||||
(?<=[0-9])oz
 | 
					 | 
				
			||||||
(?<=[0-9])m/s
 | 
					 | 
				
			||||||
(?<=[0-9])km/h
 | 
					 | 
				
			||||||
(?<=[0-9])mph
 | 
					 | 
				
			||||||
(?<=[0-9])°C
 | 
					 | 
				
			||||||
(?<=[0-9])°K
 | 
					 | 
				
			||||||
(?<=[0-9])°F
 | 
					 | 
				
			||||||
(?<=[0-9])hPa
 | 
					 | 
				
			||||||
(?<=[0-9])Pa
 | 
					 | 
				
			||||||
(?<=[0-9])mbar
 | 
					 | 
				
			||||||
(?<=[0-9])mb
 | 
					 | 
				
			||||||
(?<=[0-9])T
 | 
					 | 
				
			||||||
(?<=[0-9])G
 | 
					 | 
				
			||||||
(?<=[0-9])M
 | 
					 | 
				
			||||||
(?<=[0-9])K
 | 
					 | 
				
			||||||
(?<=[0-9])kb
 | 
					 | 
				
			||||||
'''.strip().split('\n')
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
TOKENIZER_INFIXES = (r'''\.\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-z]) '''
 | 
					 | 
				
			||||||
                     r'''(?<=[a-zA-Z])--(?=[a-zA-z]) (?<=[0-9])-(?=[0-9]) '''
 | 
					 | 
				
			||||||
                     r'''(?<=[A-Za-z]),(?=[A-Za-z])''').split()
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
TOKENIZER_EXCEPTIONS = {
 | 
					 | 
				
			||||||
    "vs.": [{"F": "vs."}],
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    "''": [{"F": "''"}],
 | 
					 | 
				
			||||||
    "—": [{"F": "—", "L": "--", "pos": "$,"}],
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    "a.m.": [{"F": "a.m."}],
 | 
					 | 
				
			||||||
    "p.m.": [{"F": "p.m."}],
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    "1a.m.": [{"F": "1"}, {"F": "a.m."}],
 | 
					 | 
				
			||||||
    "2a.m.": [{"F": "2"}, {"F": "a.m."}],
 | 
					 | 
				
			||||||
    "3a.m.": [{"F": "3"}, {"F": "a.m."}],
 | 
					 | 
				
			||||||
    "4a.m.": [{"F": "4"}, {"F": "a.m."}],
 | 
					 | 
				
			||||||
    "5a.m.": [{"F": "5"}, {"F": "a.m."}],
 | 
					 | 
				
			||||||
    "6a.m.": [{"F": "6"}, {"F": "a.m."}],
 | 
					 | 
				
			||||||
    "7a.m.": [{"F": "7"}, {"F": "a.m."}],
 | 
					 | 
				
			||||||
    "8a.m.": [{"F": "8"}, {"F": "a.m."}],
 | 
					 | 
				
			||||||
    "9a.m.": [{"F": "9"}, {"F": "a.m."}],
 | 
					 | 
				
			||||||
    "10a.m.": [{"F": "10"}, {"F": "a.m."}],
 | 
					 | 
				
			||||||
    "11a.m.": [{"F": "11"}, {"F": "a.m."}],
 | 
					 | 
				
			||||||
    "12a.m.": [{"F": "12"}, {"F": "a.m."}],
 | 
					 | 
				
			||||||
    "1am": [{"F": "1"}, {"F": "am", "L": "a.m."}],
 | 
					 | 
				
			||||||
    "2am": [{"F": "2"}, {"F": "am", "L": "a.m."}],
 | 
					 | 
				
			||||||
    "3am": [{"F": "3"}, {"F": "am", "L": "a.m."}],
 | 
					 | 
				
			||||||
    "4am": [{"F": "4"}, {"F": "am", "L": "a.m."}],
 | 
					 | 
				
			||||||
    "5am": [{"F": "5"}, {"F": "am", "L": "a.m."}],
 | 
					 | 
				
			||||||
    "6am": [{"F": "6"}, {"F": "am", "L": "a.m."}],
 | 
					 | 
				
			||||||
    "7am": [{"F": "7"}, {"F": "am", "L": "a.m."}],
 | 
					 | 
				
			||||||
    "8am": [{"F": "8"}, {"F": "am", "L": "a.m."}],
 | 
					 | 
				
			||||||
    "9am": [{"F": "9"}, {"F": "am", "L": "a.m."}],
 | 
					 | 
				
			||||||
    "10am": [{"F": "10"}, {"F": "am", "L": "a.m."}],
 | 
					 | 
				
			||||||
    "11am": [{"F": "11"}, {"F": "am", "L": "a.m."}],
 | 
					 | 
				
			||||||
    "12am": [{"F": "12"}, {"F": "am", "L": "a.m."}],
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    "p.m.": [{"F": "p.m."}],
 | 
					 | 
				
			||||||
    "1p.m.": [{"F": "1"}, {"F": "p.m."}],
 | 
					 | 
				
			||||||
    "2p.m.": [{"F": "2"}, {"F": "p.m."}],
 | 
					 | 
				
			||||||
    "3p.m.": [{"F": "3"}, {"F": "p.m."}],
 | 
					 | 
				
			||||||
    "4p.m.": [{"F": "4"}, {"F": "p.m."}],
 | 
					 | 
				
			||||||
    "5p.m.": [{"F": "5"}, {"F": "p.m."}],
 | 
					 | 
				
			||||||
    "6p.m.": [{"F": "6"}, {"F": "p.m."}],
 | 
					 | 
				
			||||||
    "7p.m.": [{"F": "7"}, {"F": "p.m."}],
 | 
					 | 
				
			||||||
    "8p.m.": [{"F": "8"}, {"F": "p.m."}],
 | 
					 | 
				
			||||||
    "9p.m.": [{"F": "9"}, {"F": "p.m."}],
 | 
					 | 
				
			||||||
    "10p.m.": [{"F": "10"}, {"F": "p.m."}],
 | 
					 | 
				
			||||||
    "11p.m.": [{"F": "11"}, {"F": "p.m."}],
 | 
					 | 
				
			||||||
    "12p.m.": [{"F": "12"}, {"F": "p.m."}],
 | 
					 | 
				
			||||||
    "1pm": [{"F": "1"}, {"F": "pm", "L": "p.m."}],
 | 
					 | 
				
			||||||
    "2pm": [{"F": "2"}, {"F": "pm", "L": "p.m."}],
 | 
					 | 
				
			||||||
    "3pm": [{"F": "3"}, {"F": "pm", "L": "p.m."}],
 | 
					 | 
				
			||||||
    "4pm": [{"F": "4"}, {"F": "pm", "L": "p.m."}],
 | 
					 | 
				
			||||||
    "5pm": [{"F": "5"}, {"F": "pm", "L": "p.m."}],
 | 
					 | 
				
			||||||
    "6pm": [{"F": "6"}, {"F": "pm", "L": "p.m."}],
 | 
					 | 
				
			||||||
    "7pm": [{"F": "7"}, {"F": "pm", "L": "p.m."}],
 | 
					 | 
				
			||||||
    "8pm": [{"F": "8"}, {"F": "pm", "L": "p.m."}],
 | 
					 | 
				
			||||||
    "9pm": [{"F": "9"}, {"F": "pm", "L": "p.m."}],
 | 
					 | 
				
			||||||
    "10pm": [{"F": "10"}, {"F": "pm", "L": "p.m."}],
 | 
					 | 
				
			||||||
    "11pm": [{"F": "11"}, {"F": "pm", "L": "p.m."}],
 | 
					 | 
				
			||||||
    "12pm": [{"F": "12"}, {"F": "pm", "L": "p.m."}],
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    "Ala.": [{"F": "Ala."}],
 | 
					 | 
				
			||||||
    "Ariz.": [{"F": "Ariz."}],
 | 
					 | 
				
			||||||
    "Ark.": [{"F":  "Ark."}],
 | 
					 | 
				
			||||||
    "Calif.": [{"F": "Calif."}],
 | 
					 | 
				
			||||||
    "Colo.": [{"F": "Colo."}],
 | 
					 | 
				
			||||||
    "Conn.": [{"F": "Conn."}],
 | 
					 | 
				
			||||||
    "Del.": [{"F":  "Del."}],
 | 
					 | 
				
			||||||
    "D.C.": [{"F": "D.C."}],
 | 
					 | 
				
			||||||
    "Fla.": [{"F":  "Fla."}],
 | 
					 | 
				
			||||||
    "Ga.": [{"F": "Ga."}],
 | 
					 | 
				
			||||||
    "Ill.": [{"F": "Ill."}],
 | 
					 | 
				
			||||||
    "Ind.": [{"F": "Ind."}],
 | 
					 | 
				
			||||||
    "Kans.": [{"F": "Kans."}],
 | 
					 | 
				
			||||||
    "Kan.": [{"F": "Kan."}],
 | 
					 | 
				
			||||||
    "Ky.": [{"F": "Ky."}],
 | 
					 | 
				
			||||||
    "La.": [{"F": "La."}],
 | 
					 | 
				
			||||||
    "Md.": [{"F": "Md."}],
 | 
					 | 
				
			||||||
    "Mass.": [{"F": "Mass."}],
 | 
					 | 
				
			||||||
    "Mich.": [{"F": "Mich."}],
 | 
					 | 
				
			||||||
    "Minn.": [{"F": "Minn."}],
 | 
					 | 
				
			||||||
    "Miss.": [{"F": "Miss."}],
 | 
					 | 
				
			||||||
    "Mo.": [{"F": "Mo."}],
 | 
					 | 
				
			||||||
    "Mont.": [{"F": "Mont."}],
 | 
					 | 
				
			||||||
    "Nebr.": [{"F": "Nebr."}],
 | 
					 | 
				
			||||||
    "Neb.": [{"F": "Neb."}],
 | 
					 | 
				
			||||||
    "Nev.": [{"F":  "Nev."}],
 | 
					 | 
				
			||||||
    "N.H.": [{"F": "N.H."}],
 | 
					 | 
				
			||||||
    "N.J.": [{"F": "N.J."}],
 | 
					 | 
				
			||||||
    "N.M.": [{"F": "N.M."}],
 | 
					 | 
				
			||||||
    "N.Y.": [{"F": "N.Y."}],
 | 
					 | 
				
			||||||
    "N.C.": [{"F": "N.C."}],
 | 
					 | 
				
			||||||
    "N.D.": [{"F": "N.D."}],
 | 
					 | 
				
			||||||
    "Okla.": [{"F": "Okla."}],
 | 
					 | 
				
			||||||
    "Ore.": [{"F": "Ore."}],
 | 
					 | 
				
			||||||
    "Pa.": [{"F": "Pa."}],
 | 
					 | 
				
			||||||
    "Tenn.": [{"F": "Tenn."}],
 | 
					 | 
				
			||||||
    "Va.": [{"F": "Va."}],
 | 
					 | 
				
			||||||
    "Wash.": [{"F": "Wash."}],
 | 
					 | 
				
			||||||
    "Wis.": [{"F": "Wis."}],
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    ":)":  [{"F": ":)"}],
 | 
					 | 
				
			||||||
    "<3":  [{"F": "<3"}],
 | 
					 | 
				
			||||||
    ";)":  [{"F": ";)"}],
 | 
					 | 
				
			||||||
    "(:":  [{"F": "(:"}],
 | 
					 | 
				
			||||||
    ":(":  [{"F": ":("}],
 | 
					 | 
				
			||||||
    "-_-": [{"F": "-_-"}],
 | 
					 | 
				
			||||||
    "=)":  [{"F": "=)"}],
 | 
					 | 
				
			||||||
    ":/":  [{"F": ":/"}],
 | 
					 | 
				
			||||||
    ":>":  [{"F": ":>"}],
 | 
					 | 
				
			||||||
    ";-)": [{"F": ";-)"}],
 | 
					 | 
				
			||||||
    ":Y":  [{"F": ":Y"}],
 | 
					 | 
				
			||||||
    ":P":  [{"F": ":P"}],
 | 
					 | 
				
			||||||
    ":-P": [{"F": ":-P"}],
 | 
					 | 
				
			||||||
    ":3":  [{"F": ":3"}],
 | 
					 | 
				
			||||||
    "=3":  [{"F": "=3"}],
 | 
					 | 
				
			||||||
    "xD":  [{"F": "xD"}],
 | 
					 | 
				
			||||||
    "^_^": [{"F": "^_^"}],
 | 
					 | 
				
			||||||
    "=]":  [{"F": "=]"}],
 | 
					 | 
				
			||||||
    "=D":  [{"F": "=D"}],
 | 
					 | 
				
			||||||
    "<333":    [{"F": "<333"}],
 | 
					 | 
				
			||||||
    ":))": [{"F": ":))"}],
 | 
					 | 
				
			||||||
    ":0":  [{"F": ":0"}],
 | 
					 | 
				
			||||||
    "-__-":    [{"F": "-__-"}],
 | 
					 | 
				
			||||||
    "xDD": [{"F": "xDD"}],
 | 
					 | 
				
			||||||
    "o_o": [{"F": "o_o"}],
 | 
					 | 
				
			||||||
    "o_O": [{"F": "o_O"}],
 | 
					 | 
				
			||||||
    "V_V": [{"F": "V_V"}],
 | 
					 | 
				
			||||||
    "=[[": [{"F": "=[["}],
 | 
					 | 
				
			||||||
    "<33": [{"F": "<33"}],
 | 
					 | 
				
			||||||
    ";p":  [{"F": ";p"}],
 | 
					 | 
				
			||||||
    ";D":  [{"F": ";D"}],
 | 
					 | 
				
			||||||
    ";-p": [{"F": ";-p"}],
 | 
					 | 
				
			||||||
    ";(":  [{"F": ";("}],
 | 
					 | 
				
			||||||
    ":p":  [{"F": ":p"}],
 | 
					 | 
				
			||||||
    ":]":  [{"F": ":]"}],
 | 
					 | 
				
			||||||
    ":O":  [{"F": ":O"}],
 | 
					 | 
				
			||||||
    ":-/": [{"F": ":-/"}],
 | 
					 | 
				
			||||||
    ":-)": [{"F": ":-)"}],
 | 
					 | 
				
			||||||
    ":(((":    [{"F": ":((("}],
 | 
					 | 
				
			||||||
    ":((": [{"F": ":(("}],
 | 
					 | 
				
			||||||
    ":')": [{"F": ":')"}],
 | 
					 | 
				
			||||||
    "(^_^)":   [{"F": "(^_^)"}],
 | 
					 | 
				
			||||||
    "(=":  [{"F": "(="}],
 | 
					 | 
				
			||||||
    "o.O": [{"F": "o.O"}],
 | 
					 | 
				
			||||||
    "\")": [{"F": "\")"}],
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    "a.": [{"F": "a."}],
 | 
					 | 
				
			||||||
    "b.": [{"F": "b."}],
 | 
					 | 
				
			||||||
    "c.": [{"F": "c."}],
 | 
					 | 
				
			||||||
    "d.": [{"F": "d."}],
 | 
					 | 
				
			||||||
    "e.": [{"F": "e."}],
 | 
					 | 
				
			||||||
    "f.": [{"F": "f."}],
 | 
					 | 
				
			||||||
    "g.": [{"F": "g."}],
 | 
					 | 
				
			||||||
    "h.": [{"F": "h."}],
 | 
					 | 
				
			||||||
    "i.": [{"F": "i."}],
 | 
					 | 
				
			||||||
    "j.": [{"F": "j."}],
 | 
					 | 
				
			||||||
    "k.": [{"F": "k."}],
 | 
					 | 
				
			||||||
    "l.": [{"F": "l."}],
 | 
					 | 
				
			||||||
    "m.": [{"F": "m."}],
 | 
					 | 
				
			||||||
    "n.": [{"F": "n."}],
 | 
					 | 
				
			||||||
    "o.": [{"F": "o."}],
 | 
					 | 
				
			||||||
    "p.": [{"F": "p."}],
 | 
					 | 
				
			||||||
    "q.": [{"F": "q."}],
 | 
					 | 
				
			||||||
    "r.": [{"F": "r."}],
 | 
					 | 
				
			||||||
    "s.": [{"F": "s."}],
 | 
					 | 
				
			||||||
    "t.": [{"F": "t."}],
 | 
					 | 
				
			||||||
    "u.": [{"F": "u."}],
 | 
					 | 
				
			||||||
    "v.": [{"F": "v."}],
 | 
					 | 
				
			||||||
    "w.": [{"F": "w."}],
 | 
					 | 
				
			||||||
    "x.": [{"F": "x."}],
 | 
					 | 
				
			||||||
    "y.": [{"F": "y."}],
 | 
					 | 
				
			||||||
    "z.": [{"F": "z."}],
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
TAG_MAP = {
 | 
					TAG_MAP = {
 | 
				
			||||||
"$(": {"pos": "PUNCT", "PunctType": "Brck"},
 | 
					
 | 
				
			||||||
"$,": {"pos": "PUNCT", "PunctType": "Comm"},
 | 
					}
 | 
				
			||||||
"$.": {"pos": "PUNCT", "PunctType": "Peri"},
 | 
					
 | 
				
			||||||
"ADJA":	{"pos": "ADJ"},
 | 
					STOP_WORDS = set("""
 | 
				
			||||||
"ADJD":	{"pos": "ADJ", "Variant": "Short"},
 | 
					
 | 
				
			||||||
"ADV":	{"pos": "ADV"},
 | 
					""".split())
 | 
				
			||||||
"APPO":	{"pos": "ADP", "AdpType": "Post"},
 | 
					
 | 
				
			||||||
"APPR":	{"pos": "ADP", "AdpType": "Prep"},
 | 
					
 | 
				
			||||||
"APPRART":	{"pos": "ADP", "AdpType": "Prep", "PronType": "Art"},
 | 
					TOKENIZER_EXCEPTIONS = {
 | 
				
			||||||
"APZR":	{"pos": "ADP", "AdpType": "Circ"},
 | 
					
 | 
				
			||||||
"ART":	{"pos": "DET", "PronType": "Art"},
 | 
					}
 | 
				
			||||||
"CARD":	{"pos": "NUM", "NumType": "Card"},
 | 
					
 | 
				
			||||||
"FM":	{"pos": "X", "Foreign": "Yes"},
 | 
					
 | 
				
			||||||
"ITJ":	{"pos": "INTJ"},
 | 
					ORTH_ONLY = {
 | 
				
			||||||
"KOKOM": {"pos": "CONJ", "ConjType": "Comp"},
 | 
					
 | 
				
			||||||
"KON": {"pos": "CONJ"},
 | 
					 | 
				
			||||||
"KOUI":	{"pos": "SCONJ"},
 | 
					 | 
				
			||||||
"KOUS":	{"pos": "SCONJ"},
 | 
					 | 
				
			||||||
"NE": {"pos": "PROPN"},
 | 
					 | 
				
			||||||
"NNE": {"pos": "PROPN"},
 | 
					 | 
				
			||||||
"NN": {"pos": "NOUN"},
 | 
					 | 
				
			||||||
"PAV": {"pos": "ADV", "PronType": "Dem"},
 | 
					 | 
				
			||||||
"PROAV": {"pos": "ADV", "PronType": "Dem"},
 | 
					 | 
				
			||||||
"PDAT":	{"pos": "DET", "PronType": "Dem"},
 | 
					 | 
				
			||||||
"PDS": {"pos": "PRON", "PronType": "Dem"},
 | 
					 | 
				
			||||||
"PIAT":	{"pos": "DET", "PronType": "Ind,Neg,Tot"},
 | 
					 | 
				
			||||||
"PIDAT":	{"pos": "DET", "AdjType": "Pdt", "PronType": "Ind,Neg,Tot"},
 | 
					 | 
				
			||||||
"PIS":	{"pos": "PRON", "PronType": "Ind,Neg,Tot"},
 | 
					 | 
				
			||||||
"PPER":	{"pos": "PRON", "PronType": "Prs"},
 | 
					 | 
				
			||||||
"PPOSAT":	{"pos": "DET", "Poss": "Yes", "PronType": "Prs"},
 | 
					 | 
				
			||||||
"PPOSS":	{"pos": "PRON", "Poss": "Yes", "PronType": "Prs"},
 | 
					 | 
				
			||||||
"PRELAT":	{"pos": "DET", "PronType": "Rel"},
 | 
					 | 
				
			||||||
"PRELS":	{"pos": "PRON", "PronType": "Rel"},
 | 
					 | 
				
			||||||
"PRF":	{"pos": "PRON", "PronType": "Prs", "Reflex": "Yes"},
 | 
					 | 
				
			||||||
"PTKA":	{"pos": "PART"},
 | 
					 | 
				
			||||||
"PTKANT":	{"pos": "PART", "PartType": "Res"},
 | 
					 | 
				
			||||||
"PTKNEG":	{"pos": "PART", "Negative": "Neg"},
 | 
					 | 
				
			||||||
"PTKVZ":	{"pos": "PART", "PartType": "Vbp"},
 | 
					 | 
				
			||||||
"PTKZU":	{"pos": "PART", "PartType": "Inf"},
 | 
					 | 
				
			||||||
"PWAT":	{"pos": "DET", "PronType": "Int"},
 | 
					 | 
				
			||||||
"PWAV":	{"pos": "ADV", "PronType": "Int"},
 | 
					 | 
				
			||||||
"PWS":	{"pos": "PRON", "PronType": "Int"},
 | 
					 | 
				
			||||||
"TRUNC":	{"pos": "X", "Hyph": "Yes"},
 | 
					 | 
				
			||||||
"VAFIN":	{"pos": "AUX", "Mood": "Ind", "VerbForm": "Fin"},
 | 
					 | 
				
			||||||
"VAIMP":	{"pos": "AUX", "Mood": "Imp", "VerbForm": "Fin"},
 | 
					 | 
				
			||||||
"VAINF":	{"pos": "AUX", "VerbForm": "Inf"},
 | 
					 | 
				
			||||||
"VAPP":	{"pos": "AUX", "Aspect": "Perf", "VerbForm": "Part"},
 | 
					 | 
				
			||||||
"VMFIN":	{"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin", "VerbType": "Mod"},
 | 
					 | 
				
			||||||
"VMINF":	{"pos": "VERB", "VerbForm": "Inf", "VerbType": "Mod"},
 | 
					 | 
				
			||||||
"VMPP":	{"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part", "VerbType": "Mod"},
 | 
					 | 
				
			||||||
"VVFIN":	{"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin"},
 | 
					 | 
				
			||||||
"VVIMP":	{"pos": "VERB", "Mood": "Imp", "VerbForm": "Fin"},
 | 
					 | 
				
			||||||
"VVINF":	{"pos": "VERB", "VerbForm": "Inf"},
 | 
					 | 
				
			||||||
"VVIZU":	{"pos": "VERB", "VerbForm": "Inf"},
 | 
					 | 
				
			||||||
"VVPP":	{"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part"},
 | 
					 | 
				
			||||||
"XY":	{"pos": "X"},
 | 
					 | 
				
			||||||
"SP": {"pos": "SPACE"}
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user