spaCy/spacy/lang/pt/lex_attrs.py
Lucas Abbade be7fdc59d1 Update lex_attrs.py (#2307)
* Update lex_attrs.py

Fixed spelling mistakes of some numbers (according to Brazilian Portuguese).

* Update lex_attrs.py

As requested, I've included the correct spelling for both Brazilian Portuguese and European Portuguese.

I would advise, however, that the two be separated in the future. Brazilian Portuguese is very different from European Portuguese: although most of the written language is unified, the way people speak in the two countries is radically different. Keeping both variants as one language may lead to bigger issues in the future, especially when it comes to spell checking.
2018-05-09 20:49:31 +02:00

41 lines
1.5 KiB
Python

# coding: utf8
from __future__ import unicode_literals
from ...attrs import LIKE_NUM
# Cardinal number words. Where Brazilian and European Portuguese spell a
# number differently, both spellings are listed (e.g. 'dezesseis' /
# 'dezasseis', 'bilhão' / 'bilião').
_num_words = [
    'zero', 'um', 'dois', 'três', 'quatro', 'cinco', 'seis', 'sete', 'oito',
    'nove', 'dez', 'onze', 'doze', 'treze', 'catorze', 'quatorze', 'quinze',
    'dezesseis', 'dezasseis', 'dezessete', 'dezassete', 'dezoito',
    'dezenove', 'dezanove',
    'vinte', 'trinta', 'quarenta', 'cinquenta', 'sessenta', 'setenta',
    'oitenta', 'noventa',
    # Hundreds, matching the hundreds that _ordinal_words already covers.
    'cem', 'cento', 'duzentos', 'trezentos', 'quatrocentos', 'quinhentos',
    'seiscentos', 'setecentos', 'oitocentos', 'novecentos',
    'mil', 'milhão', 'bilhão', 'bilião', 'trilhão', 'trilião',
    'quatrilhão', 'quatrilião',
]

# Ordinal number words (masculine singular forms).
_ordinal_words = [
    'primeiro', 'segundo', 'terceiro', 'quarto', 'quinto', 'sexto',
    'sétimo', 'oitavo', 'nono', 'décimo', 'vigésimo', 'trigésimo',
    'quadragésimo', 'quinquagésimo', 'sexagésimo', 'septuagésimo',
    'octogésimo', 'nonagésimo', 'centésimo', 'ducentésimo',
    'trecentésimo', 'quadringentésimo', 'quingentésimo', 'sexcentésimo',
    'septingentésimo', 'octingentésimo', 'nongentésimo', 'milésimo',
    'milionésimo', 'bilionésimo',
]


def like_num(text):
    """Return True if `text` looks like a number.

    One leading sign or approximation marker ('+', '-', '±', '~') is
    ignored, and ',' / '.' separators as well as the ordinal indicators
    'º' / 'ª' are removed. The remaining text is number-like when it is:

    * made up of digits only, e.g. "10", "1.000,50", "-3", "2º";
    * a fraction of two digit strings, e.g. "3/4";
    * a word found in `_num_words` or `_ordinal_words`, compared
      case-insensitively.

    text (unicode): The token text to check.
    RETURNS (bool): Whether the text resembles a number.
    """
    # "-10" or "+5" are still numbers; drop a single leading marker so the
    # digit checks below are not defeated by it.
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    # Remove separators and ordinal indicators ("1.000,50", "1º", "2ª").
    for char in (',', '.', 'º', 'ª'):
        text = text.replace(char, '')
    if text.isdigit():
        return True
    if text.count('/') == 1:
        num, denom = text.split('/')
        if num.isdigit() and denom.isdigit():
            return True
    # Lowercase once and reuse it for both word tables.
    lowered = text.lower()
    return lowered in _num_words or lowered in _ordinal_words
# Maps the LIKE_NUM attribute to the function that computes it.
LEX_ATTRS = {LIKE_NUM: like_num}