mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	* Revise tokenization rules to match PTB. Rules are pretty messy around periods, need better support for these.
This commit is contained in:
		
							parent
							
								
									5fe5e6e66b
								
							
						
					
					
						commit
						677e111ee7
					
				| 
						 | 
					@ -11,3 +11,8 @@ $
 | 
				
			||||||
'
 | 
					'
 | 
				
			||||||
``
 | 
					``
 | 
				
			||||||
`
 | 
					`
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
 | 
					US$
 | 
				
			||||||
 | 
					C$
 | 
				
			||||||
 | 
					A$
 | 
				
			||||||
 | 
					a-
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -6,99 +6,100 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
's  's
 | 
					's  's
 | 
				
			||||||
'S  'S
 | 
					'S  'S
 | 
				
			||||||
ain't   are not
 | 
					ain't   ai n't
 | 
				
			||||||
aren't  are not
 | 
					aren't  are n't
 | 
				
			||||||
can't   can not
 | 
					can't   ca n't
 | 
				
			||||||
cannot  can not
 | 
					cannot  can not
 | 
				
			||||||
could've    could have
 | 
					could've    could 've
 | 
				
			||||||
couldn't    could not
 | 
					couldn't    could n't
 | 
				
			||||||
couldn't've could not have
 | 
					couldn't've could n't 've
 | 
				
			||||||
didn't  did not
 | 
					didn't  did n't
 | 
				
			||||||
doesn't does not
 | 
					doesn't does n't
 | 
				
			||||||
don't   do not
 | 
					don't   do n't
 | 
				
			||||||
hadn't  had not
 | 
					hadn't  had n't
 | 
				
			||||||
hadn't've   had not have
 | 
					hadn't've   had n't 've
 | 
				
			||||||
hasn't  has not
 | 
					hasn't  has n't
 | 
				
			||||||
haven't have not
 | 
					haven't have n't
 | 
				
			||||||
he'd    he would
 | 
					he'd    he 'd
 | 
				
			||||||
he'd've he would have
 | 
					he'd've he 'd 've
 | 
				
			||||||
he'll   he will
 | 
					he'll   he 'll
 | 
				
			||||||
he's    he 's
 | 
					he's    he 's
 | 
				
			||||||
how'd   he would
 | 
					how'd   he 'd
 | 
				
			||||||
how'll  he will
 | 
					how'll  he 'll
 | 
				
			||||||
how's   how 's
 | 
					how's   how 's
 | 
				
			||||||
I'd I would
 | 
					I'd I 'd
 | 
				
			||||||
I'd've  I would have
 | 
					I'd've  I 'd 've
 | 
				
			||||||
I'll    I will
 | 
					I'll    I 'll
 | 
				
			||||||
I'm I am
 | 
					I'm I 'm
 | 
				
			||||||
I'ma    I will
 | 
					I'ma    I 'ma
 | 
				
			||||||
I've    I have
 | 
					I've    I 've
 | 
				
			||||||
isn't   is not
 | 
					isn't   is n't
 | 
				
			||||||
it'd    it would
 | 
					it'd    it 'd
 | 
				
			||||||
it'd've it would have
 | 
					it'd've it 'd 've
 | 
				
			||||||
it'll   it will
 | 
					it'll   it 'll
 | 
				
			||||||
it's    it 's
 | 
					it's    it 's
 | 
				
			||||||
let's   let 's
 | 
					let's   let 's
 | 
				
			||||||
mightn't    might not
 | 
					mightn't    might n't
 | 
				
			||||||
mightn't've might not have
 | 
					mightn't've might n't 've
 | 
				
			||||||
might've    might have
 | 
					might've    might 've
 | 
				
			||||||
mustn't must not
 | 
					mustn't must n't
 | 
				
			||||||
must've must have
 | 
					must've must 've
 | 
				
			||||||
needn't need not
 | 
					needn't need n't
 | 
				
			||||||
not've  not have
 | 
					not've  not 've
 | 
				
			||||||
shan't  shall not
 | 
					shan't  sha n't
 | 
				
			||||||
she'd   she would
 | 
					she'd   she 'd
 | 
				
			||||||
she'd've    she would have
 | 
					she'd've    she 'd 've
 | 
				
			||||||
she'll  she will
 | 
					she'll  she will
 | 
				
			||||||
she's   she 's
 | 
					she's   she 's
 | 
				
			||||||
should've   should have
 | 
					should've   should 've
 | 
				
			||||||
shouldn't   should not
 | 
					shouldn't   should n't
 | 
				
			||||||
shouldn't've    should not have
 | 
					shouldn't've    should n't 've
 | 
				
			||||||
that's  that 's
 | 
					that's  that 's
 | 
				
			||||||
there'd there would
 | 
					there'd there 'd
 | 
				
			||||||
there'd've  there would have
 | 
					there'd've  there 'd 've
 | 
				
			||||||
there's there is
 | 
					there's there 's
 | 
				
			||||||
they'd  there would
 | 
					they'd  there 'd
 | 
				
			||||||
they'd've   they would have
 | 
					they'd've   they 'd 've
 | 
				
			||||||
they'll they will
 | 
					they'll they 'll
 | 
				
			||||||
they're they are
 | 
					they're they 're
 | 
				
			||||||
they've they have
 | 
					they've they 've
 | 
				
			||||||
wasn't  was not
 | 
					wasn't  was n't
 | 
				
			||||||
we'd    we would
 | 
					we'd    we 'd
 | 
				
			||||||
we'd've we would have
 | 
					we'd've we 'd 've
 | 
				
			||||||
we'll   we will
 | 
					we'll   we 'll
 | 
				
			||||||
we're   we are
 | 
					we're   we 're
 | 
				
			||||||
we've   we have
 | 
					we've   we 've
 | 
				
			||||||
weren't were not
 | 
					weren't were n't
 | 
				
			||||||
what'll what will
 | 
					what'll what 'll
 | 
				
			||||||
what're what are
 | 
					what're what 're
 | 
				
			||||||
what's  what 's
 | 
					what's  what 's
 | 
				
			||||||
what've what have
 | 
					what've what 've
 | 
				
			||||||
when's  when 's
 | 
					when's  when 's
 | 
				
			||||||
where'd where would
 | 
					where'd where 'd
 | 
				
			||||||
where's where 's
 | 
					where's where 's
 | 
				
			||||||
where've    where have
 | 
					where've    where 've
 | 
				
			||||||
who'd   who would
 | 
					who'd   who 'd
 | 
				
			||||||
who'll  who will
 | 
					who'll  who 'll
 | 
				
			||||||
who're  who are
 | 
					who're  who 're
 | 
				
			||||||
who's   who 's
 | 
					who's   who 's
 | 
				
			||||||
who've  who have
 | 
					who've  who 've
 | 
				
			||||||
why'll  who will
 | 
					why'll  why 'll
 | 
				
			||||||
why're  why are
 | 
					why're  why 're
 | 
				
			||||||
why's   why 's
 | 
					why's   why 's
 | 
				
			||||||
won't   will not
 | 
					won't   wo n't
 | 
				
			||||||
would've    would have
 | 
					would've    would 've
 | 
				
			||||||
wouldn't    would not
 | 
					wouldn't    would n't
 | 
				
			||||||
wouldn't've would not have
 | 
					wouldn't've would n't 've
 | 
				
			||||||
you'd   you would
 | 
					you'd   you 'd
 | 
				
			||||||
you'd've    you would have
 | 
					you'd've    you 'd 've
 | 
				
			||||||
you'll  you will
 | 
					you'll  you 'll
 | 
				
			||||||
you're  you are
 | 
					you're  you 're
 | 
				
			||||||
you've  you have
 | 
					you've  you 've
 | 
				
			||||||
'em them
 | 
					'em 'em
 | 
				
			||||||
'ol old
 | 
					'ol 'ol
 | 
				
			||||||
10km    10 km
 | 
					10km    10 km
 | 
				
			||||||
U.S.    U.S.
 | 
					U.S.    U.S.
 | 
				
			||||||
 | 
					U.K.    U.K.
 | 
				
			||||||
non-U.S.    non-U.S.
 | 
					non-U.S.    non-U.S.
 | 
				
			||||||
U.N.    U.N.
 | 
					U.N.    U.N.
 | 
				
			||||||
Co. Co.
 | 
					Co. Co.
 | 
				
			||||||
| 
						 | 
					@ -115,7 +116,12 @@ A.G.    A.G.
 | 
				
			||||||
Rep.    Rep.
 | 
					Rep.    Rep.
 | 
				
			||||||
Ms. Ms.
 | 
					Ms. Ms.
 | 
				
			||||||
Mr. Mr.
 | 
					Mr. Mr.
 | 
				
			||||||
 | 
					Mrs.    Mrs.
 | 
				
			||||||
a.m.    a.m.
 | 
					a.m.    a.m.
 | 
				
			||||||
 | 
					Sen.    Sen.
 | 
				
			||||||
 | 
					INC.    INC.
 | 
				
			||||||
 | 
					CO. CO.
 | 
				
			||||||
 | 
					COS.    COS.
 | 
				
			||||||
p.m.    p.m.
 | 
					p.m.    p.m.
 | 
				
			||||||
Nos.    Nos.
 | 
					Nos.    Nos.
 | 
				
			||||||
a.k.a.  a.k.a.
 | 
					a.k.a.  a.k.a.
 | 
				
			||||||
| 
						 | 
					@ -127,6 +133,7 @@ E.  E.
 | 
				
			||||||
F.  F.
 | 
					F.  F.
 | 
				
			||||||
G.  G.
 | 
					G.  G.
 | 
				
			||||||
H.  H.
 | 
					H.  H.
 | 
				
			||||||
 | 
					I.  I.
 | 
				
			||||||
J.  J.
 | 
					J.  J.
 | 
				
			||||||
K.  K.
 | 
					K.  K.
 | 
				
			||||||
L.  L.
 | 
					L.  L.
 | 
				
			||||||
| 
						 | 
					@ -205,6 +212,9 @@ Wash.   Wash.
 | 
				
			||||||
W.Va.   W.Va.
 | 
					W.Va.   W.Va.
 | 
				
			||||||
Wis.    Wis.
 | 
					Wis.    Wis.
 | 
				
			||||||
Wyo.    Wyo.
 | 
					Wyo.    Wyo.
 | 
				
			||||||
 | 
					L.A.    L.A.
 | 
				
			||||||
 | 
					R.H.    R.H.
 | 
				
			||||||
 | 
					Gov.    Gov.
 | 
				
			||||||
''  ''
 | 
					''  ''
 | 
				
			||||||
:)  :)
 | 
					:)  :)
 | 
				
			||||||
<3  <3
 | 
					<3  <3
 | 
				
			||||||
| 
						 | 
					@ -262,3 +272,19 @@ V_V V_V
 | 
				
			||||||
o.O o.O
 | 
					o.O o.O
 | 
				
			||||||
")  ")
 | 
					")  ")
 | 
				
			||||||
....    ....
 | 
					....    ....
 | 
				
			||||||
 | 
					a-  a -
 | 
				
			||||||
 | 
					Messrs. Messrs.
 | 
				
			||||||
 | 
					No. No.
 | 
				
			||||||
 | 
					vs. vs.
 | 
				
			||||||
 | 
					Gen.    Gen.
 | 
				
			||||||
 | 
					Cos.    Cos.
 | 
				
			||||||
 | 
					L.J.    L.J.
 | 
				
			||||||
 | 
					D.T.    D.T.
 | 
				
			||||||
 | 
					Prof.   Prof.
 | 
				
			||||||
 | 
					Bros.   Bros.
 | 
				
			||||||
 | 
					J.C.    J.C.
 | 
				
			||||||
 | 
					Neb.    Neb.
 | 
				
			||||||
 | 
					Adm.    Adm.
 | 
				
			||||||
 | 
					U.S.S.R.    U.S.S.R.
 | 
				
			||||||
 | 
					Rev.    Rev.
 | 
				
			||||||
 | 
					H.F.    H.F.
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user