#!/usr/bin/env python3
"""
Tokenization is the process of breaking a stream of text up into words, phrases,
symbols, or other meaningful elements called tokens. The list of tokens becomes
input for further processing such as parsing or text mining. -- Wikipedia
"""
import re
import nltk.corpus

def tokenize(**kw):
    """ Tokenize string data, returning a set of lower-cased tokens. """
    # Defaults
    kw.setdefault('text', None)
    kw.setdefault('min_len', 4)
    kw.setdefault('ignore_list', [])
    kw.setdefault('filter_stopwords', False)
    kw.setdefault('filter_plainwords', False)
    if kw['text'] is None:
        return None
    if kw['filter_stopwords']:
        # stopwords.words() with no argument returns the stopword lists for
        # every language bundled with NLTK
        kw['ignore_list'] += nltk.corpus.stopwords.words()
    if kw['filter_plainwords']:
        plain_words = {word.lower() for word in nltk.corpus.abc.words()}
    else:
        plain_words = set()
    # [A-Za-z]+ matches runs of ASCII letters; the original [A-z] range also
    # admitted the characters between 'Z' and 'a', such as '[' and '_'
    matches = set(re.findall(r'[A-Za-z]+', kw['text']))
    # Compare case-insensitively against the ignore list
    ignore = {word.lower() for word in kw['ignore_list']}
    tokens = {token.lower() for token in matches
              if token.lower() not in ignore
              and len(token) >= kw['min_len']}
    if not kw['filter_plainwords']:
        return tokens
    return tokens.intersection(plain_words)

def test():
    """ tokenize() test """
    import requests
    text = input("path or url to string data > ")
    if not text.startswith('http'):
        # Treat the input as a local file path
        with open(text, 'r') as handle:
            text = handle.read()
    else:
        # Treat the input as a URL and fetch its body
        text = requests.get(text).text
    print(tokenize(text=text,
                   min_len=4,
                   ignore_list=['code'],
                   filter_plainwords=True,
                   filter_stopwords=True))
## TESTING tokenize
if __name__ == '__main__':
    test()
# More information on nltk can be found at www.nltk.org
# A review of this script can be found on Code Review.
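# Note: the stopwords and abc corpora are not installed with NLTK itself.
# Assuming the standard NLTK downloader is available, they can be fetched
# once with:
#
#   import nltk
#   nltk.download('stopwords')
#   nltk.download('abc')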