mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Document regex utilities [ci skip]
This commit is contained in:
parent
cd4bc6757b
commit
1ea1bc98e7
|
@ -315,6 +315,11 @@ def read_regex(path):
|
|||
|
||||
|
||||
def compile_prefix_regex(entries):
|
||||
"""Compile a list of prefix rules into a regex object.
|
||||
|
||||
entries (tuple): The prefix rules, e.g. spacy.lang.punctuation.TOKENIZER_PREFIXES.
|
||||
RETURNS (regex object): The regex object to be used for Tokenizer.prefix_search.
|
||||
"""
|
||||
if "(" in entries:
|
||||
# Handle deprecated data
|
||||
expression = "|".join(
|
||||
|
@ -327,11 +332,21 @@ def compile_prefix_regex(entries):
|
|||
|
||||
|
||||
def compile_suffix_regex(entries):
    """Build one compiled regex out of a collection of suffix rules.

    entries (tuple): The suffix rules, e.g. spacy.lang.punctuation.TOKENIZER_SUFFIXES.
    RETURNS (regex object): The regex object to be used for Tokenizer.suffix_search.
    """
    anchored_rules = []
    for rule in entries:
        # Rules that are empty or consist only of whitespace are dropped.
        if not rule.strip():
            continue
        # "$" is appended to the rule text before the rules are joined.
        anchored_rules.append(rule + "$")
    pattern = "|".join(anchored_rules)
    return re.compile(pattern)
|
||||
|
||||
|
||||
def compile_infix_regex(entries):
    """Build one compiled regex out of a collection of infix rules.

    entries (tuple): The infix rules, e.g. spacy.lang.punctuation.TOKENIZER_INFIXES.
    RETURNS (regex object): The regex object to be used for Tokenizer.infix_finditer.
    """
    # Rules that are empty or consist only of whitespace are left out.
    usable_rules = (rule for rule in entries if rule.strip())
    # The remaining rules become the alternatives of a single pattern.
    pattern = "|".join(usable_rules)
    return re.compile(pattern)
|
||||
|
||||
|
|
|
@ -504,6 +504,57 @@ an error if key doesn't match `ORTH` values.
|
|||
| `*addition_dicts` | dicts | Exception dictionaries to add to the base exceptions, in order. |
|
||||
| **RETURNS** | dict | Combined tokenizer exceptions. |
|
||||
|
||||
### util.compile_prefix_regex {#util.compile_prefix_regex tag="function"}
|
||||
|
||||
Compile a sequence of prefix rules into a regex object.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> prefixes = ("§", "%", "=", r"\+")
|
||||
> prefix_regex = util.compile_prefix_regex(prefixes)
|
||||
> nlp.tokenizer.prefix_search = prefix_regex.search
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `entries` | tuple | The prefix rules, e.g. [`lang.punctuation.TOKENIZER_PREFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). |
|
||||
| **RETURNS** | [regex](https://docs.python.org/3/library/re.html#re-objects) | The regex object to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes).                                                   |
|
||||
|
||||
### util.compile_suffix_regex {#util.compile_suffix_regex tag="function"}
|
||||
|
||||
Compile a sequence of suffix rules into a regex object.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> suffixes = ("'s", "'S", r"(?<=[0-9])\+")
|
||||
> suffix_regex = util.compile_suffix_regex(suffixes)
|
||||
> nlp.tokenizer.suffix_search = suffix_regex.search
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `entries` | tuple | The suffix rules, e.g. [`lang.punctuation.TOKENIZER_SUFFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). |
|
||||
| **RETURNS** | [regex](https://docs.python.org/3/library/re.html#re-objects) | The regex object to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes).                                                   |
|
||||
|
||||
### util.compile_infix_regex {#util.compile_infix_regex tag="function"}
|
||||
|
||||
Compile a sequence of infix rules into a regex object.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> infixes = ("…", "-", "—", r"(?<=[0-9])[+\-\*^](?=[0-9-])")
|
||||
> infix_regex = util.compile_infix_regex(infixes)
|
||||
> nlp.tokenizer.infix_finditer = infix_regex.finditer
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `entries` | tuple | The infix rules, e.g. [`lang.punctuation.TOKENIZER_INFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). |
|
||||
| **RETURNS** | [regex](https://docs.python.org/3/library/re.html#re-objects) | The regex object to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes).                                                |
|
||||
|
||||
### util.minibatch {#util.minibatch tag="function" new="2"}
|
||||
|
||||
Iterate over batches of items. `size` may be an iterator, so that batch-size can
|
||||
|
|
Loading…
Reference in New Issue
Block a user