Another lowly programmer's blog: May 2014

Python: tokenize.py Updated

#!/usr/bin/env python

"""
Tokenization is the process of breaking a stream of text up into words, phrases,
symbols, or other meaningful elements called tokens. The list of tokens becomes
input for further processing such as parsing or text mining. -- Wikipedia
"""

import re
import nltk.corpus

def tokenize(**kw):
    """Tokenize string data into a set of lowercase words.

    Keyword arguments:
        text: String to tokenize. Defaults to None, in which case None is
            returned.
        min_len: Minimum number of letters a word needs to be kept
            (default 4).
        ignore_list: Words to leave out, compared case-insensitively
            (default empty). The list passed in is never modified.
        filter_stopwords: When true, additionally leave out every word in
            nltk.corpus.stopwords.words() (default False).
        filter_plainwords: When true, keep only tokens that are also in
            nltk.corpus.abc.words() (default False).

    Returns:
        A set of lowercase tokens, or None when no text was supplied.
    """
    text = kw.get('text')
    if text is None:
        return None

    min_len = kw.get('min_len', 4)

    # Work on a private set: extending the caller's list in place would leak
    # the stopwords into it, and set membership avoids rescanning a long
    # list for every word.
    ignore_words = kw.get('ignore_list') or []
    ignored = set(word.lower() for word in ignore_words)
    if kw.get('filter_stopwords'):
        ignored.update(nltk.corpus.stopwords.words())

    # [A-Za-z], not [A-z]: the latter also spans the ASCII punctuation that
    # sits between 'Z' and 'a' ([ \ ] ^ _ `).
    tokens = set()
    for word in re.findall(r'[A-Za-z]+', text):
        # Lowercase before filtering so capitalised ignore words are caught.
        word = word.lower()
        if len(word) >= min_len and word not in ignored:
            tokens.add(word)

    if kw.get('filter_plainwords'):
        return tokens.intersection(nltk.corpus.abc.words())
    return tokens

def test():
    """Interactively exercise tokenize().

    Prompts for a local path or a URL, loads the text found there and prints
    the tokens produced with stopword and plain-word filtering switched on.
    """
    source = raw_input("path or url to string data > ")
    if source.startswith('http'):
        # Imported here so local files work without requests installed.
        import requests
        text = requests.get(source).content
    else:
        # The context manager closes the file even if the read fails.
        with open(source, 'r') as handle:
            text = handle.read()
    print(tokenize(text=text,
                   min_len=4,
                   ignore_list=['code'],
                   filter_plainwords=True,
                   filter_stopwords=True))


## TESTING tokenize
# Run the interactive test() only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    test()

#!/usr/bin/env python

""" Simple rss to html converter """

__version__ = "0.0.2"
__author__ = "Ricky L Wilson"

from feedparser import parse as parsefeed
import StringIO

def entry2html(**kwargs):
    """Render one rss entry as a utf-8 encoded html fragment.

    The markup uses the 'title', 'link' and 'description' keyword
    arguments; the title appears both as plain text and as the link text.
    """
    markup = (
        u"\n"
        u"    <span class='entry-title'>{title}</span>\n"
        u"    <a class='entry-link' href='{link}'>{title}</a>\n"
        u"    <span class='entry-description'>{description}</span>\n"
        u"    "
    )
    rendered = markup.format(**kwargs)
    return rendered.encode('utf-8')


def metaFormater(**kwargs):
    """Render a feed's meta data as a utf-8 encoded html fragment.

    The markup uses the 'title', 'date' and 'description' keyword arguments.
    """
    lines = [
        u"",
        u"    <span class='feed-title'>{title}</span>",
        u"    <span class='feed-date'>{date}</span>",
        u"    <span class='feed-description'>{description}</span>",
        u"    ",
    ]
    markup = u"\n".join(lines)
    return markup.format(**kwargs).encode('utf-8')


def convert_feed(**kwargs):
    """Convert a single rss feed to html.

    Keyword arguments:
        url: Location of the feed, handed to feedparser's parse().

    Returns:
        The feed's meta data block followed by one block per entry, each
        terminated by a newline, joined into a single string.
    """
    parsed = parsefeed(kwargs['url'])
    feed = parsed.feed

    # Fields feedparser could not find are absent from its result, so fall
    # back to an empty string rather than failing on an incomplete feed.
    chunks = [metaFormater(title=feed.get('title', u''),
                           description=feed.get('description', u''),
                           date=feed.get('date', u''))]
    for entry in parsed.entries:
        chunks.append(entry2html(title=entry.get('title', u''),
                                 link=entry.get('link', u''),
                                 description=entry.get('description', u'')))

    # Every block is followed by a newline.
    return ''.join(chunk + '\n' for chunk in chunks)


def convert_feeds(**kwargs):
    """Print the html rendering of every feed url in kwargs['urls']."""
    for feed_url in kwargs['urls']:
        html = convert_feed(url=feed_url)
        print(html)



# Demo run: convert the Stack Overflow feed (listed twice) and print the
# result. The call is not behind a __main__ guard, so it also runs on import.
convert_feeds(urls=['http://stackoverflow.com/feeds',
                    'http://stackoverflow.com/feeds'])

Python rss to html conversion script

""" Simple rss to html converter """

__version__ = "0.0.1"
__author__ = "Ricky L Wilson"

from feedparser import parse as parsefeed
import StringIO


def entry2html(**kwargs):
    """Render a feedparser entry as an html fragment.

    Expects 'title', 'link' and 'description' keyword arguments; each one is
    utf-8 encoded before being substituted into the markup.
    """
    encoded = dict((field, kwargs[field].encode('utf-8'))
                   for field in ('title', 'link', 'description'))
    markup = (
        "\n"
        "    <h2 class='title'>{title}</h2>\n"
        "    <a class='link' href='{link}'>{title}</a>\n"
        "    <span class='description'>{description}</span>\n"
        "    "
    )
    return markup.format(**encoded)


def convert_feed(**kwargs):
    """Convert the entries of one rss feed to html.

    Keyword arguments:
        url: Location of the feed, handed to feedparser's parse().

    Returns:
        One html block per entry, each terminated by a newline, joined into
        a single string (empty when the feed has no entries).
    """
    blocks = []
    for entry in parsefeed(kwargs['url']).entries:
        # Entries are not guaranteed to carry every field; use an empty
        # string for a missing one instead of failing on the whole feed.
        blocks.append(entry2html(title=entry.get('title', u''),
                                 link=entry.get('link', u''),
                                 description=entry.get('description', u'')))
    return ''.join(block + '\n' for block in blocks)

# Demo run: fetch the Stack Overflow feed and print its html rendering.
# The call is not behind a __main__ guard, so it also runs on import.
print convert_feed(url='http://stackoverflow.com/feeds')

Code Review

Python: tokenize.py Updated

rss2html 0.0.2

Python rss to html conversion script