	Merge pull request #772 from raphael0202/french-support
Add French tokenization support
Commit c784b49d33
spacy/fr/__init__.py

@@ -1,12 +1,11 @@
# encoding: utf8
from __future__ import unicode_literals, print_function

from os import path

from ..language import Language
from ..attrs import LANG

from .language_data import *
from .punctuation import TOKENIZER_INFIXES


class French(Language):

@@ -18,3 +17,4 @@ class French(Language):

        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
        stop_words = STOP_WORDS
        infixes = tuple(TOKENIZER_INFIXES)
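As a quick way to exercise these defaults, the sketch below builds the tokenizer the same way the fr_tokenizer test fixture further down does; the sample sentence is an assumption for illustration.

# Minimal sketch: create the French tokenizer from the class defaults and
# tokenize a short sample sentence.
from spacy.fr import French

tokenizer = French.Defaults.create_tokenizer()
doc = tokenizer("Aujourd'hui, nous allons aux prud'hommes.")
print([token.text for token in doc])
# "Aujourd'hui" and "prud'hommes" stay single tokens because of the infix
# exceptions defined in language_data.py.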
spacy/fr/language_data.py

@@ -4,6 +4,9 @@ from __future__ import unicode_literals
from .. import language_data as base
from ..language_data import strings_to_exc, update_exc

from .punctuation import ELISION

from ..symbols import *
from .stop_words import STOP_WORDS

@@ -13,5 +16,53 @@ STOP_WORDS = set(STOP_WORDS)
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))

ABBREVIATIONS = {
    "janv.": [
        {LEMMA: "janvier", ORTH: "janv."}
    ],
    "févr.": [
        {LEMMA: "février", ORTH: "févr."}
    ],
    "avr.": [
        {LEMMA: "avril", ORTH: "avr."}
    ],
    "juill.": [
        {LEMMA: "juillet", ORTH: "juill."}
    ],
    "sept.": [
        {LEMMA: "septembre", ORTH: "sept."}
    ],
    "oct.": [
        {LEMMA: "octobre", ORTH: "oct."}
    ],
    "nov.": [
        {LEMMA: "novembre", ORTH: "nov."}
    ],
    "déc.": [
        {LEMMA: "décembre", ORTH: "déc."}
    ],
}


INFIXES_EXCEPTIONS_BASE = ["aujourd'hui",
                           "prud'homme", "prud'hommes",
                           "prud'homal", "prud'homaux", "prud'homale",
                           "prud'homales",
                           "prud'hommal", "prud'hommaux", "prud'hommale",
                           "prud'hommales",
                           "prud'homie", "prud'homies",
                           "prud'hommesque", "prud'hommesques",
                           "prud'hommesquement"]

INFIXES_EXCEPTIONS = []
for elision_char in ELISION:
    INFIXES_EXCEPTIONS += [infix.replace("'", elision_char)
                           for infix in INFIXES_EXCEPTIONS_BASE]

INFIXES_EXCEPTIONS += [word.capitalize() for word in INFIXES_EXCEPTIONS]

update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(INFIXES_EXCEPTIONS))
update_exc(TOKENIZER_EXCEPTIONS, ABBREVIATIONS)


__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
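The loop above fans every base form out over each elision character and then appends capitalized variants. A standalone sketch of that expansion in plain Python, independent of spaCy, using two of the base forms:

# Sketch of the infix-exception expansion: one variant per elision character
# (straight and typographic apostrophe), plus capitalized copies of them all.
ELISION = "'’"  # built in spacy/fr/punctuation.py, shown below
base = ["aujourd'hui", "prud'homme"]

expanded = []
for elision_char in ELISION:
    expanded += [w.replace("'", elision_char) for w in base]
expanded += [w.capitalize() for w in expanded]

print(expanded)
# ["aujourd'hui", "prud'homme", "aujourd’hui", "prud’homme",
#  "Aujourd'hui", "Prud'homme", "Aujourd’hui", "Prud’homme"]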
							
								
								
									
spacy/fr/punctuation.py (new file, 16 lines)

@@ -0,0 +1,16 @@
# encoding: utf8

from __future__ import unicode_literals

from ..language_data.punctuation import ALPHA, TOKENIZER_INFIXES


_ELISION = " ' ’ "
ELISION = _ELISION.strip().replace(' ', '').replace('\n', '')

TOKENIZER_INFIXES += [
    r'(?<=[{a}][{el}])(?=[{a}])'.format(a=ALPHA, el=ELISION),
]


__all__ = ["TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
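The added infix is a zero-width pattern that splits between a letter-plus-apostrophe pair and the following letter. Here is a standalone sketch with Python's re module; the ALPHA class is a simplified stand-in for spaCy's much larger Unicode character class:

import re

# Simplified stand-in for spaCy's ALPHA class (an assumption for this sketch).
ALPHA = "a-zA-Zàâçéèêëîïôûùü"
ELISION = "'’"
infix = r'(?<=[{a}][{el}])(?=[{a}])'.format(a=ALPHA, el=ELISION)

match = re.search(infix, "l'avion")
print(match.start())                      # 2: the split point right after "l'"
print(re.search(infix, "janv.") is None)  # True: no letter-apostrophe-letter seam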
spacy/tests/conftest.py

@@ -52,6 +52,11 @@ def de_tokenizer():
    return German.Defaults.create_tokenizer()


@pytest.fixture
def fr_tokenizer():
    return French.Defaults.create_tokenizer()


@pytest.fixture
def hu_tokenizer():
    return Hungarian.Defaults.create_tokenizer()
spacy/tests/fr/__init__.py (new file, 1 line)

@@ -0,0 +1 @@
# coding: utf-8
spacy/tests/fr/test_exceptions.py (new file, 30 lines)

@@ -0,0 +1,30 @@
# coding: utf-8

from __future__ import unicode_literals

import pytest


@pytest.mark.parametrize('text', ["aujourd'hui", "Aujourd'hui", "prud'hommes",
                                  "prud’hommal"])
def test_tokenizer_infix_exceptions(fr_tokenizer, text):
    tokens = fr_tokenizer(text)
    assert len(tokens) == 1


@pytest.mark.parametrize('text,lemma', [("janv.", "janvier"),
                                        ("juill.", "juillet"),
                                        ("sept.", "septembre")])
def test_tokenizer_handles_abbr(fr_tokenizer, text, lemma):
    tokens = fr_tokenizer(text)
    assert len(tokens) == 1
    assert tokens[0].lemma_ == lemma


def test_tokenizer_handles_exc_in_text(fr_tokenizer):
    text = "Je suis allé au mois de janv. aux prud’hommes."
    tokens = fr_tokenizer(text)
    assert len(tokens) == 10
    assert tokens[6].text == "janv."
    assert tokens[6].lemma_ == "janvier"
    assert tokens[8].text == "prud’hommes"
spacy/tests/fr/test_text.py (new file, 19 lines)

@@ -0,0 +1,19 @@
# encoding: utf8


from __future__ import unicode_literals


def test_tokenizer_handles_long_text(fr_tokenizer):
    text = """L'histoire du TAL commence dans les années 1950, bien que l'on puisse \
trouver des travaux antérieurs. En 1950, Alan Turing éditait un article \
célèbre sous le titre « Computing machinery and intelligence » qui propose ce \
qu'on appelle à présent le test de Turing comme critère d'intelligence. \
Ce critère dépend de la capacité d'un programme informatique de personnifier \
un humain dans une conversation écrite en temps réel, de façon suffisamment \
convaincante que l'interlocuteur humain ne peut distinguer sûrement — sur la \
base du seul contenu de la conversation — s'il interagit avec un programme \
ou avec un autre vrai humain."""

    tokens = fr_tokenizer(text)
    assert len(tokens) == 113
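To run just the new French test modules, something along these lines should work (a sketch; it assumes pytest and this branch of spaCy are installed in the current environment):

# Run only the French tokenizer tests added by this PR.
import pytest

exit_code = pytest.main(["spacy/tests/fr", "-v"])
print(exit_code)  # 0 when all tests pass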