spaCy/spacy/lang/fr/tokenizer_exceptions.py

import re
_hyphen = "-–—"  # hyphen-minus, en dash, em dash
_apostrophe = "'`´"  # straight apostrophe, backtick, acute accent
# fmt: off
_suffix_inversion = r"|".join([
    "je", "tu", "on", "il", "elle", "iel",
    "nous", "vous", "elles", "ils", "iels",
    # dis-moi, parle-leur
    "moi", "toi", "lui", "leur",
    "eux",
    # parles-en, vas-y (the lookahead excludes compounds like arc-en-ciel)
    fr"en(?![{_hyphen}])", "y",
    # écoutons-les (the lookaheads exclude place names like Noisy-le-Grand)
    fr"la(?![{_hyphen}])", fr"le(?![{_hyphen}])", fr"les(?![{_hyphen}])",
    # a-t-il, pourra-t'on, dis-m'en plus
    fr"t[{_hyphen}]??[{_apostrophe}]?", fr"m[{_apostrophe}]?",
    "ici",
])
_prefix_elision = r"|".join([
    "n", "s", "c", "d", "j", "m", "t", "l", "qu",
    # exclude "quelqu'un"/"quelqu'une": that is a single token, not
    # quelque + un (which would lose the sense of 'one person')
    fr"quelqu(?![{_apostrophe}]un[ex]*\b)",  # quelque
    "jusqu", "presqu", "lorsqu", "puisqu", "quoiqu",
])
# fmt: on
# elision: a prefix followed by an apostrophe, as in l'avion or jusqu'ici
_elision = rf"(?:\b(?:{_prefix_elision})[{_apostrophe}])"
# inversion: a hyphen after a letter, followed by a clitic, as in a-t-il
_inversion = rf"(?:(?<=[^\W\d])[{_hyphen}]\b(?:{_suffix_inversion})\b)"
# compiled pattern exported as the tokenizer's TOKEN_MATCH rule
TOKEN_MATCH = re.compile(r"(?iu)" + r"|".join([_inversion, _elision]))
# _abbrevs = ["ste?", "mme", "mr?", "mlle", "dr", "etc", "cf"]
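
# A minimal, hypothetical smoke test (not part of the original module),
# probing the compiled pattern directly; spaCy itself would consume
# TOKEN_MATCH through the tokenizer rather than like this.
if __name__ == "__main__":
    for text in ["a-t-il", "pourra-t'on", "l'avion", "jusqu'ici", "quelqu'un"]:
        m = TOKEN_MATCH.search(text)
        # e.g. "l'avion" yields "l'", while "quelqu'un" yields no match
        print(text, "->", m.group(0) if m else None)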