"""
Unit tests for nltk.tokenize.
See also nltk/test/tokenize.doctest
"""

import pytest

from nltk.tokenize import (
    punkt,
    word_tokenize,
    TweetTokenizer,
    StanfordSegmenter,
    TreebankWordTokenizer,
    SyllableTokenizer,
    LegalitySyllableTokenizer,
)


def setup_module(module):
    import pytest

    try:
        seg = StanfordSegmenter()
        seg.default_config("ar")
        seg.default_config("zh")
    except LookupError as e:
        pytest.skip("Tests for nltk.tokenize.stanford_segmenter skipped: %s" % str(e))

    try:
        # StanfordTokenizer is not re-exported by nltk.tokenize, so import it
        # from its own module; instantiating it raises LookupError when the
        # jar cannot be found.
        from nltk.tokenize.stanford import StanfordTokenizer

        StanfordTokenizer()
    except LookupError:
        pytest.skip(
            "Tests for nltk.tokenize.stanford are skipped because the "
            "stanford postagger jar doesn't exist"
        )


class TestTokenize:
    def test_tweet_tokenizer(self):
        """
        Test TweetTokenizer using words with special and accented characters.
        """
        tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
        s9 = "@myke: Let's test these words: resumé España München français"
        tokens = tokenizer.tokenize(s9)
        expected = [
            ":", "Let's", "test", "these", "words", ":",
            "resumé", "España", "München", "français",
        ]
        assert tokens == expected

    def test_sonority_sequencing_syllable_tokenizer(self):
        """
        Test SyllableTokenizer tokenizer.
        """
        tokenizer = SyllableTokenizer()
        tokens = tokenizer.tokenize("justification")
        assert tokens == ["jus", "ti", "fi", "ca", "tion"]

    def test_legality_principle_syllable_tokenizer(self):
        """
        Test LegalitySyllableTokenizer tokenizer.
        """
        from nltk.corpus import words

        test_word = "wonderful"
        tokenizer = LegalitySyllableTokenizer(words.words())
        tokens = tokenizer.tokenize(test_word)
        assert tokens == ["won", "der", "ful"]

    def test_stanford_segmenter_arabic(self):
        """
        Test the Stanford Word Segmenter for Arabic (default config)
        """
        try:
            seg = StanfordSegmenter()
            seg.default_config("ar")
            sent = "يبحث علم الحاسوب استخدام الحوسبة بجميع اشكالها لحل المشكلات"
            segmented_sent = seg.segment(sent.split())
            assert segmented_sent.split() == [
                "يبحث", "علم", "الحاسوب", "استخدام", "الحوسبة", "ب",
                "جميع", "اشكال", "ها", "ل", "حل", "المشكلات",
            ]
        except LookupError as e:
            pytest.skip(str(e))

    def test_stanford_segmenter_chinese(self):
        """
        Test the Stanford Word Segmenter for Chinese (default config)
        """
        try:
            seg = StanfordSegmenter()
            seg.default_config("zh")
            sent = "这是斯坦福中文分词器测试"
            segmented_sent = seg.segment(sent.split())
            assert segmented_sent.split() == ["这", "是", "斯坦福", "中文", "分词器", "测试"]
        except LookupError as e:
            pytest.skip(str(e))

    def test_phone_tokenizer(self):
        """
        Test a string that resembles a phone number but contains a newline
        """
        tokenizer = TweetTokenizer()

        # Should be recognized as a single phone-number token, despite the
        # multiple spaces
        test1 = "(393)  928 -3010"
        expected = ["(393)  928 -3010"]
        result = tokenizer.tokenize(test1)
        assert result == expected

        # Due to the newline, the first three elements are no longer part of
        # a phone number; the fourth still is
        test2 = "(393)\n928 -3010"
        expected = ["(", "393", ")", "928 -3010"]
        result = tokenizer.tokenize(test2)
        assert result == expected

    def test_pad_asterisk(self):
        """
        Test padding of asterisk for word tokenization.
        """
        text = "This is a, *weird sentence with *asterisks in it."
        expected = [
            "This", "is", "a", ",", "*", "weird", "sentence",
            "with", "*", "asterisks", "in", "it", ".",
        ]
        assert word_tokenize(text) == expected

    def test_pad_dotdot(self):
        """
        Test padding of dotdot* for word tokenization.
        """
        text = "Why did dotdot.. not get tokenized but dotdotdot... did? How about manydots....."
        expected = [
            "Why", "did", "dotdot", "..", "not", "get", "tokenized", "but",
            "dotdotdot", "...", "did", "?", "How", "about", "manydots", ".....",
        ]
        assert word_tokenize(text) == expected

    def test_remove_handle(self):
        """
        Test remove_handle() from casual.py with specially crafted edge cases
        """
        tokenizer = TweetTokenizer(strip_handles=True)

        # Simple example. Handles with just numbers should be allowed
        test1 = "@twitter hello @twi_tter_. hi @12345 @123news"
        expected = ["hello", ".", "hi"]
        result = tokenizer.tokenize(test1)
        assert result == expected

        # Handles are allowed to follow any of the following characters
        test2 = "@n`@n~@n(@n)@n-@n=@n+@n\\@n|@n[@n]@n{@n}@n;@n:@n'@n\"@n/@n?@n.@n,@n<@n>@n @n\n@n ñ@n.ü@n.ç@n."
        expected = [
            "`", "~", "(", ")", "-", "=", "+", "\\", "|", "[", "]",
            "{", "}", ";", ":", "'", '"', "/", "?", ".", ",", "<", ">",
            "ñ", ".", "ü", ".", "ç", ".",
        ]
        result = tokenizer.tokenize(test2)
        assert result == expected

        # Handles are NOT allowed to follow any of the following characters
        test3 = "a@n j@n z@n A@n L@n Z@n 1@n 4@n 7@n 9@n 0@n _@n !@n @@n #@n $@n %@n &@n *@n"
        expected = [
            "a", "@n", "j", "@n", "z", "@n", "A", "@n", "L", "@n",
            "Z", "@n", "1", "@n", "4", "@n", "7", "@n", "9", "@n",
            "0", "@n", "_", "@n", "!", "@n", "@", "@n", "#", "@n",
            "$", "@n", "%", "@n", "&", "@n", "*", "@n",
        ]
        result = tokenizer.tokenize(test3)
        assert result == expected

        # Handles are allowed to precede the following characters
        test4 = "@n!a @n#a @n$a @n%a @n&a @n*a"
        expected = ["!", "a", "#", "a", "$", "a", "%", "a", "&", "a", "*", "a"]
        result = tokenizer.tokenize(test4)
        assert result == expected

        # Tests interactions with special symbols and multiple @
        test5 = "@n!@n @n#@n @n$@n @n%@n @n&@n @n*@n @n@n @@n @n@@n @n_@n @n7@n @nj@n"
        expected = [
            "!", "@n", "#", "@n", "$", "@n", "%", "@n", "&", "@n", "*", "@n",
            "@n", "@n", "@", "@n", "@n", "@", "@n", "@n", "_", "@n", "@n",
            "7", "@n", "@n", "j", "@n",
        ]
        result = tokenizer.tokenize(test5)
        assert result == expected

        # Tests that handles can have a max length of 20 characters
        test6 = "@abcdefghijklmnopqrstuvwxyz @abcdefghijklmnopqrst1234 @abcdefghijklmnopqrst_ @abcdefghijklmnopqrstendofhandle"
        expected = ["uvwxyz", "1234", "_", "endofhandle"]
        result = tokenizer.tokenize(test6)
        assert result == expected

        # Edge case where an @ comes directly after a long handle
        test7 = "@abcdefghijklmnopqrstu@abcde @abcdefghijklmnopqrst@abcde @abcdefghijklmnopqrst_@abcde @abcdefghijklmnopqrst5@abcde"
        expected = [
            "u", "@abcde", "@abcdefghijklmnopqrst", "@abcde",
            "_", "@abcde", "5", "@abcde",
        ]
        result = tokenizer.tokenize(test7)
        assert result == expected

    def test_treebank_span_tokenizer(self):
        """
        Test TreebankWordTokenizer.span_tokenize function
        """
        tokenizer = TreebankWordTokenizer()

        # Test case in the docstring
        test1 = "Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks)."
        expected = [
            (0, 4), (5, 12), (13, 17), (18, 19), (19, 23), (24, 26),
            (27, 30), (31, 32), (32, 36), (36, 37), (37, 38), (40, 46),
            (47, 48), (48, 51), (51, 52), (53, 55), (56, 59), (60, 62),
            (63, 68), (69, 70), (70, 76), (76, 77), (77, 78),
        ]
        result = list(tokenizer.span_tokenize(test1))
        assert result == expected

        # Test case with double quotation
        test2 = 'The DUP is similar to the "religious right" in the United States and takes a hardline stance on social issues'
        expected = [
            (0, 3), (4, 7), (8, 10), (11, 18), (19, 21), (22, 25),
            (26, 27), (27, 36), (37, 42), (42, 43), (44, 46), (47, 50),
            (51, 57), (58, 64), (65, 68), (69, 74), (75, 76), (77, 85),
            (86, 92), (93, 95), (96, 102), (103, 109),
        ]
        result = list(tokenizer.span_tokenize(test2))
        assert result == expected

        # Test case with double quotation as well as converted quotations
        test3 = "The DUP is similar to the \"religious right\" in the United States and takes a ``hardline'' stance on social issues"
        expected = [
            (0, 3), (4, 7), (8, 10), (11, 18), (19, 21), (22, 25),
            (26, 27), (27, 36), (37, 42), (42, 43), (44, 46), (47, 50),
            (51, 57), (58, 64), (65, 68), (69, 74), (75, 76), (77, 79),
            (79, 87), (87, 89), (90, 96), (97, 99), (100, 106), (107, 113),
        ]
        result = list(tokenizer.span_tokenize(test3))
        assert result == expected

    def test_word_tokenize(self):
        """
        Test word_tokenize function
        """
        sentence = "The 'v', I've been fooled but I'll seek revenge."
        expected = [
            "The", "'", "v", "'", ",", "I", "'ve", "been", "fooled",
            "but", "I", "'ll", "seek", "revenge", ".",
        ]
        assert word_tokenize(sentence) == expected

        sentence = "'v' 're'"
        expected = ["'", "v", "'", "'re", "'"]
        assert word_tokenize(sentence) == expected

    def test_punkt_pair_iter(self):
        test_cases = [
            ("12", [("1", "2"), ("2", None)]),
            ("123", [("1", "2"), ("2", "3"), ("3", None)]),
            ("1234", [("1", "2"), ("2", "3"), ("3", "4"), ("4", None)]),
        ]

        for test_input, expected_output in test_cases:
            actual_output = [x for x in punkt._pair_iter(test_input)]
            assert actual_output == expected_output

    def test_punkt_pair_iter_handles_stop_iteration_exception(self):
        # test input to trigger StopIteration from next()
        it = iter([])
        # call method under test and produce a generator
        gen = punkt._pair_iter(it)
        # unpack generator, ensure that no error is raised
        list(gen)

    def test_punkt_tokenize_words_handles_stop_iteration_exception(self):
        obj = punkt.PunktBaseClass()

        class TestPunktTokenizeWordsMock:
            def word_tokenize(self, s):
                return iter([])

        obj._lang_vars = TestPunktTokenizeWordsMock()
        # unpack generator, ensure that no error is raised
        list(obj._tokenize_words("test"))

    def test_punkt_tokenize_custom_lang_vars(self):
        # Create LangVars including a full stop end character as used in Bengali
        class BengaliLanguageVars(punkt.PunktLanguageVars):
            sent_end_chars = (".", "?", "!", "।")

        obj = punkt.PunktSentenceTokenizer(lang_vars=BengaliLanguageVars())

        # We now expect these sentences to be split up into the individual sentences
        sentences = "উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন। অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’  উপস্থিত ছিলেন। এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।"
        expected = [
            "উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন।",
            "অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’  উপস্থিত ছিলেন।",
            "এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।",
        ]

        assert obj.tokenize(sentences) == expected

    def test_punkt_tokenize_no_custom_lang_vars(self):
        obj = punkt.PunktSentenceTokenizer()

        # We expect these sentences to NOT be split up into individual sentences,
        # as the Bengali full stop "।" is not in the default sent_end_chars
        sentences = "উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন। অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’  উপস্থিত ছিলেন। এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।"
        expected = [
            "উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন। অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’  উপস্থিত ছিলেন। এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।"
        ]

        assert obj.tokenize(sentences) == expected
