mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-27 02:16:32 +03:00
be7fdc59d1
* Update lex_attrs.py: Fixed spelling mistakes in some number words (according to Brazilian Portuguese). * Update lex_attrs.py: As requested, I've included the correct spellings for both Brazilian Portuguese and European Portuguese. I would advise, however, that the two be separated in the future. Brazilian Portuguese is very different from the original language: although most of the writing is unified, the way people talk in the two countries is radically different. Keeping both languages as one may lead to bigger issues in the future, especially when it comes to spell checking.
41 lines
1.5 KiB
Python
41 lines
1.5 KiB
Python
# coding: utf8
|
|
from __future__ import unicode_literals
|
|
|
|
from ...attrs import LIKE_NUM
|
|
|
|
|
|
# Cardinal number words matched (case-insensitively) by like_num().
# Where Brazilian and European Portuguese spell a number differently, both
# spellings are listed (e.g. 'dezesseis'/'dezasseis', 'bilhão'/'bilião').
_num_words = [
    'zero', 'um', 'dois', 'três', 'quatro', 'cinco', 'seis', 'sete',
    'oito', 'nove', 'dez', 'onze', 'doze', 'treze', 'catorze',
    'quinze', 'dezesseis', 'dezasseis', 'dezessete', 'dezassete',
    'dezoito', 'dezenove', 'dezanove', 'vinte',
    'trinta', 'quarenta', 'cinquenta', 'sessenta', 'setenta',
    'oitenta', 'noventa', 'cem', 'mil', 'milhão', 'bilhão', 'bilião',
    'trilhão', 'trilião',
    'quatrilhão']
|
|
|
|
# Ordinal number words (masculine singular forms only) matched
# case-insensitively by like_num().
_ordinal_words = [
    'primeiro', 'segundo', 'terceiro', 'quarto', 'quinto', 'sexto',
    'sétimo', 'oitavo', 'nono', 'décimo', 'vigésimo', 'trigésimo',
    'quadragésimo', 'quinquagésimo', 'sexagésimo', 'septuagésimo',
    'octogésimo', 'nonagésimo', 'centésimo', 'ducentésimo',
    'trecentésimo', 'quadringentésimo', 'quingentésimo', 'sexcentésimo',
    'septingentésimo', 'octingentésimo', 'nongentésimo', 'milésimo',
    'milionésimo', 'bilionésimo']
|
|
|
|
|
|
def like_num(text):
    """Return True if ``text`` looks like a number, otherwise False.

    A token counts as number-like when, after removing every ',' and '.'
    character, it is one of:

    * a string of digits (so '1.000,50' and '10' both qualify);
    * a simple fraction: exactly one '/' with digits on both sides;
    * a word found in ``_num_words`` or ``_ordinal_words`` (compared in
      lower case).
    """
    # Separators are dropped rather than interpreted, so both the
    # '1.000,50' and '1,000.50' conventions reduce to plain digits.
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
    # Only a single slash is accepted; 'a/b/c' is not treated as a fraction.
    if text.count('/') == 1:
        num, _, denom = text.partition('/')
        if num.isdigit() and denom.isdigit():
            return True
    lowered = text.lower()
    if lowered in _num_words:
        return True
    if lowered in _ordinal_words:
        return True
    return False
|
|
|
|
|
|
# Lexical attribute getters exported by this module: maps the LIKE_NUM
# attribute ID to the function that computes it for a token's text.
# NOTE(review): presumably merged into the language defaults by the
# package's __init__ - confirm against the caller.
LEX_ATTRS = {
    LIKE_NUM: like_num
}
|