Mirror of https://github.com/explosion/spaCy.git
Merge pull request #772 from raphael0202/french-support

Add French tokenization support

Commit: c784b49d33
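This merge adds a French (fr) language module: an elision-aware infix rule in spacy/fr/punctuation.py, tokenizer exceptions for month abbreviations and prud'homme-style words in the French language data, and a fr_tokenizer test fixture plus tests. A minimal usage sketch (not part of the diff; it assumes this branch is installed and the module is importable as spacy.fr, and mirrors what the new tests below exercise):

    from spacy.fr import French

    # Build the standalone French tokenizer, as the new test fixture does.
    tokenizer = French.Defaults.create_tokenizer()

    doc = tokenizer("Je suis allé au mois de janv. aux prud’hommes.")
    print([t.text for t in doc])
    # "janv." stays a single token and carries a lemma; "prud’hommes" is not
    # split at the apostrophe thanks to the new infix exceptions.
    print(doc[6].text, doc[6].lemma_)   # expected: janv. janvier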
spacy/fr/__init__.py
@@ -1,12 +1,11 @@
 # encoding: utf8
 from __future__ import unicode_literals, print_function
 
-from os import path
-
 from ..language import Language
 from ..attrs import LANG
 
 from .language_data import *
+from .punctuation import TOKENIZER_INFIXES
 
 
 class French(Language):
@@ -18,3 +17,4 @@ class French(Language):
 
         tokenizer_exceptions = TOKENIZER_EXCEPTIONS
         stop_words = STOP_WORDS
+        infixes = tuple(TOKENIZER_INFIXES)
spacy/fr/language_data.py
@@ -4,6 +4,9 @@ from __future__ import unicode_literals
 from .. import language_data as base
 from ..language_data import strings_to_exc, update_exc
 
+from .punctuation import ELISION
+
+from ..symbols import *
 from .stop_words import STOP_WORDS
 
 
@@ -13,5 +16,53 @@ STOP_WORDS = set(STOP_WORDS)
 TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 
 
+ABBREVIATIONS = {
+    "janv.": [
+        {LEMMA: "janvier", ORTH: "janv."}
+    ],
+    "févr.": [
+        {LEMMA: "février", ORTH: "févr."}
+    ],
+    "avr.": [
+        {LEMMA: "avril", ORTH: "avr."}
+    ],
+    "juill.": [
+        {LEMMA: "juillet", ORTH: "juill."}
+    ],
+    "sept.": [
+        {LEMMA: "septembre", ORTH: "sept."}
+    ],
+    "oct.": [
+        {LEMMA: "octobre", ORTH: "oct."}
+    ],
+    "nov.": [
+        {LEMMA: "novembre", ORTH: "nov."}
+    ],
+    "déc.": [
+        {LEMMA: "décembre", ORTH: "déc."}
+    ],
+}
+
+
+INFIXES_EXCEPTIONS_BASE = ["aujourd'hui",
+                           "prud'homme", "prud'hommes",
+                           "prud'homal", "prud'homaux", "prud'homale",
+                           "prud'homales",
+                           "prud'hommal", "prud'hommaux", "prud'hommale",
+                           "prud'hommales",
+                           "prud'homie", "prud'homies",
+                           "prud'hommesque", "prud'hommesques",
+                           "prud'hommesquement"]
+
+INFIXES_EXCEPTIONS = []
+for elision_char in ELISION:
+    INFIXES_EXCEPTIONS += [infix.replace("'", elision_char)
+                           for infix in INFIXES_EXCEPTIONS_BASE]
+
+INFIXES_EXCEPTIONS += [word.capitalize() for word in INFIXES_EXCEPTIONS]
+
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(INFIXES_EXCEPTIONS))
+update_exc(TOKENIZER_EXCEPTIONS, ABBREVIATIONS)
+
 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
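The infix-exception block above expands each apostrophe form over every elision character and then appends capitalized variants before registering everything as tokenizer exceptions. A standalone sketch of that expansion (same logic as the hunk, trimmed word list; ELISION is spelled out here instead of being imported):

    # Same expansion as in the hunk above, on a shortened base list.
    ELISION = "'’"   # apostrophe and right single quotation mark, as in punctuation.py
    INFIXES_EXCEPTIONS_BASE = ["aujourd'hui", "prud'hommes"]

    INFIXES_EXCEPTIONS = []
    for elision_char in ELISION:
        INFIXES_EXCEPTIONS += [infix.replace("'", elision_char)
                               for infix in INFIXES_EXCEPTIONS_BASE]

    # Capitalized variants cover sentence-initial occurrences as well.
    INFIXES_EXCEPTIONS += [word.capitalize() for word in INFIXES_EXCEPTIONS]

    print(INFIXES_EXCEPTIONS)
    # ["aujourd'hui", "prud'hommes", "aujourd’hui", "prud’hommes",
    #  "Aujourd'hui", "Prud'hommes", "Aujourd’hui", "Prud’hommes"]

Each resulting string becomes a tokenizer exception, which is why the tests below expect these forms to survive as single tokens.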
							
								
								
									
spacy/fr/punctuation.py (new file, 16 lines)
@@ -0,0 +1,16 @@
+# encoding: utf8
+
+from __future__ import unicode_literals
+
+from ..language_data.punctuation import ALPHA, TOKENIZER_INFIXES
+
+
+_ELISION = " ' ’ "
+ELISION = _ELISION.strip().replace(' ', '').replace('\n', '')
+
+TOKENIZER_INFIXES += [
+    r'(?<=[{a}][{el}])(?=[{a}])'.format(a=ALPHA, el=ELISION),
+]
+
+
+__all__ = ["TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
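The new infix rule splits on an elision mark only when it sits between two letters. A quick check of the pattern with plain re (ALPHA here is a simplified stand-in for the character class imported from ..language_data.punctuation, not the real definition):

    import re

    ALPHA = "a-zA-ZÀ-ÿ"   # simplified stand-in for spaCy's ALPHA class
    ELISION = "'’"

    # Same pattern as above: a zero-width split point after letter + elision
    # mark, before another letter.
    infix_re = re.compile(r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION))

    print([m.start() for m in infix_re.finditer("l'avion")])      # [2]  -> l' | avion
    print([m.start() for m in infix_re.finditer("aujourd'hui")])  # [8]  -> would split here

Without the exception entries added in language_data.py, forms like aujourd'hui would be cut at that position, which is exactly what the expanded exception list prevents.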
spacy/tests/conftest.py
@@ -52,6 +52,11 @@ def de_tokenizer():
     return German.Defaults.create_tokenizer()
 
 
+@pytest.fixture
+def fr_tokenizer():
+    return French.Defaults.create_tokenizer()
+
+
 @pytest.fixture
 def hu_tokenizer():
     return Hungarian.Defaults.create_tokenizer()
							
								
								
									
spacy/tests/fr/__init__.py (new file, 1 line)
@@ -0,0 +1 @@
+# coding: utf-8
							
								
								
									
spacy/tests/fr/test_exceptions.py (new file, 30 lines)
@@ -0,0 +1,30 @@
+# coding: utf-8
+
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize('text', ["aujourd'hui", "Aujourd'hui", "prud'hommes",
+                                  "prud’hommal"])
+def test_tokenizer_infix_exceptions(fr_tokenizer, text):
+    tokens = fr_tokenizer(text)
+    assert len(tokens) == 1
+
+
+@pytest.mark.parametrize('text,lemma', [("janv.", "janvier"),
+                                        ("juill.", "juillet"),
+                                        ("sept.", "septembre")])
+def test_tokenizer_handles_abbr(fr_tokenizer, text, lemma):
+    tokens = fr_tokenizer(text)
+    assert len(tokens) == 1
+    assert tokens[0].lemma_ == lemma
+
+
+def test_tokenizer_handles_exc_in_text(fr_tokenizer):
+    text = "Je suis allé au mois de janv. aux prud’hommes."
+    tokens = fr_tokenizer(text)
+    assert len(tokens) == 10
+    assert tokens[6].text == "janv."
+    assert tokens[6].lemma_ == "janvier"
+    assert tokens[8].text == "prud’hommes"
							
								
								
									
spacy/tests/fr/test_text.py (new file, 19 lines)
@@ -0,0 +1,19 @@
+# encoding: utf8
+
+
+from __future__ import unicode_literals
+
+
+def test_tokenizer_handles_long_text(fr_tokenizer):
+    text = """L'histoire du TAL commence dans les années 1950, bien que l'on puisse \
+trouver des travaux antérieurs. En 1950, Alan Turing éditait un article \
+célèbre sous le titre « Computing machinery and intelligence » qui propose ce \
+qu'on appelle à présent le test de Turing comme critère d'intelligence. \
+Ce critère dépend de la capacité d'un programme informatique de personnifier \
+un humain dans une conversation écrite en temps réel, de façon suffisamment \
+convaincante que l'interlocuteur humain ne peut distinguer sûrement — sur la \
+base du seul contenu de la conversation — s'il interagit avec un programme \
+ou avec un autre vrai humain."""
+
+    tokens = fr_tokenizer(text)
+    assert len(tokens) == 113
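To run only the new French tests, something like the following should work from the repository root (a sketch; it simply points pytest at the new test package):

    # Sketch: run only the new French test modules.
    import pytest

    pytest.main(["spacy/tests/fr"])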