Showing posts with label python.

Python: tokenize.py Updated

#!/usr/bin/env python

"""
Tokenization is the process of breaking a stream of text up into words, phrases,
symbols, or other meaningful elements called tokens. The list of tokens becomes
input for further processing such as parsing or text mining. -- Wikipedia
"""

import re
import nltk.corpus

def tokenize(**kw):
    """ Tokenize string data """
    # Defaults
    kw.setdefault('text', None)
    kw.setdefault('min_len', 4)
    kw.setdefault('ignore_list', [])
    kw.setdefault('filter_stopwords', False)
    kw.setdefault('filter_plainwords', False)

    if kw['text'] is None:
        return None

    if kw['filter_stopwords']:
        kw['ignore_list'] += nltk.corpus.stopwords.words()

    if kw['filter_plainwords']:
        plain_words = nltk.corpus.abc.words()
    else:
        plain_words = []

    matches = set(re.findall('[A-Za-z]+', kw['text']))

    tokens = set([token.lower() for token in matches
                  if token.lower() not in kw['ignore_list']
                  and len(token) >= kw['min_len']])

    if not kw['filter_plainwords']:
        return tokens
    else:
        return tokens.intersection(plain_words)

def test():
    """ tokenize() test """
    import requests
    text = raw_input("path or url to string data > ")
    if not text.startswith('http'):
        text = open(text, 'r').read()
    else:
        text = requests.get(text).content
    print tokenize(text=text,
                   min_len=4,
                   ignore_list=['code'],
                   filter_plainwords=True,
                   filter_stopwords=True)


## TESTING tokenize
if __name__ == '__main__':
    test()
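
A quick interactive check, assuming the NLTK stopwords and abc corpora have
already been fetched with nltk.download():

# Minimal usage sketch for tokenize(); the sample sentence is illustrative.
sample = "Tokenization breaks a stream of text up into meaningful tokens."
print tokenize(text=sample, min_len=4, filter_stopwords=True)
# Expected: a set along the lines of
# set(['tokenization', 'breaks', 'stream', 'text', 'meaningful', 'tokens'])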
 

rss2html 0.0.2

#!/usr/bin/env python

""" Simple rss to html converter """

__version__ = "0.0.2"
__author__ = "Ricky L Wilson"

from feedparser import parse as parsefeed
import StringIO

def entry2html(**kwargs):
    """ Base template for formating rss entries """
    template = u"""
    <span class='entry-title'>{title}</span>
    <a class='entry-link' href='{link}'>{title}</a>
    <span class='entry-description'>{description}</span>
    """
    return template.format(**kwargs).encode('utf-8')


def metaFormater(**kwargs):
    """ Format feed meta data """
    return u"""
    <span class='feed-title'>{title}</span>
    <span class='feed-date'>{date}</span>
    <span class='feed-description'>{description}</span>
    """.format(**kwargs).encode('utf-8')


def convert_feed(**kwargs):
    """ Convert a single rss feed to html """
    out = StringIO.StringIO("")
    url = kwargs['url']
    feedObj = parsefeed(url)
    feed = feedObj.feed
    entries = feedObj.entries
    print >>out, metaFormater(title=feed.title,
                              description=feed.description,
                              date=feed.date)

    for entry in entries:
        print >>out, entry2html(title=entry.title,
                                link=entry.link,
                                description=entry.description)
    return out.getvalue()


def convert_feeds(**kwargs):
    """ Convert multiple rss feeds to html """
    for url in kwargs['urls']:
        print convert_feed(url=url)



convert_feeds(urls=['http://stackoverflow.com/feeds',
                    'http://stackoverflow.com/feeds'])
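
To write the generated markup to disk instead of printing it, a small helper
along these lines would do (save_feed and the output filename are illustrative,
not part of the script above):

def save_feed(url, path):
    """ Convert one feed and save the html fragment to a file """
    with open(path, 'w') as handle:
        handle.write(convert_feed(url=url))

save_feed('http://stackoverflow.com/feeds', 'stackoverflow.html')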

Python rss to html conversion script

""" Simple rss to html converter """

__version__ = "0.0.1"
__author__ = "Ricky L Wilson"

from feedparser import parse as parsefeed
import StringIO


def entry2html(**kwargs):
    """ Format feedparser entry """
    title = kwargs['title'].encode('utf-8')
    link = kwargs['link'].encode('utf-8')
    description = kwargs['description'].encode('utf-8')
    template = """
    <h2 class='title'>{title}</h2>
    <a class='link' href='{link}'>{title}</a>
    <span class='description'>{description}</span>
    """
    return template.format(title=title, link=link, description=description)


def convert_feed(**kwargs):
    """ Main loop """
    out = StringIO.StringIO("")
    for entry in parsefeed(kwargs['url']).entries:
        title = entry['title']
        link = entry['link']
        description = entry['description']
        print >>out, entry2html(title=title, link=link,
                                description=description)
    return out.getvalue()

print convert_feed(url='http://stackoverflow.com/feeds')

Python: Merge RSS Feeds

from datetime import datetime
from random import shuffle

import feedparser
# Feed is assumed here to be feedformatter.Feed, which provides the feed dict,
# the items list and the format_rss2_file() call used below.
from feedformatter import Feed

tstart = datetime.now()
# Set the feed/channel level properties
# ----------------------------------- #
chanTitle = 'Feed Merger'
chanLink = 'http://server.com/feed'
chanAuthor = 'Bob Dylan'
chanDescription = 'Brain Food'
# ----------------------------------- #
# Apply feed/channel level properties
# ----------------------------------- #
feed = Feed()
feed.feed["title"] = chanTitle
feed.feed["link"] = chanLink
feed.feed["author"] = chanAuthor
feed.feed["description"] = chanDescription
# ----------------------------------- #
urls = list(set(line.strip() for line in open('urls.txt', 'r') if line.strip()))
shuffle(urls)
extract_entries = lambda url: feedparser.parse(url).entries
addEntries = lambda entries: [feed.items.append(entry) for entry in entries]
merg = lambda urls: [addEntries(extract_entries(url)) for url in urls]
save = lambda outfile: feed.format_rss2_file(outfile)
merg(urls)
shuffle(feed.items)
save('feed.xml')
tend = datetime.now()
runtime = tend - tstart
print "Runtime > %s" % (runtime)
print "Merged  > %d items" % (len(feed.items))

Scrape results from thefreedictionary.com

#!/usr/bin/python

from bs4 import BeautifulSoup as bs
import re
from requests import get


def remove_non_ascii(text):
    return re.sub(r'[^\x00-\x7F]+', '', text)


def get_soup(url):
    raw = remove_non_ascii(get(url).content)
    soup = bs(raw, 'html.parser')  # explicit parser avoids bs4's "no parser" warning
    return soup.select("#MainTxt")[0].select('.ds-single')[0].text.strip()


def lookup(word):
    base_url = "http://www.thefreedictionary.com/"
    query_url = (base_url + word)
    return get_soup(query_url)

if __name__ == '__main__':
    print lookup('linux')
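
lookup() raises an IndexError when thefreedictionary.com returns a page without
the .ds-single definition block; a small hypothetical wrapper (not part of the
original script) that returns None instead:

def safe_lookup(word):
    try:
        return lookup(word)
    except IndexError:
        return None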

A simple python web-crawler

import sys
import re
import urlparse
import requests
import threading
import gevent
from gevent import monkey
import time

monkey.patch_all(
  socket=True,
  dns=True,
  time=True,
  select=True,
  thread=True,
  os=True,
  ssl=True,
  httplib=False,
  subprocess=False,
  sys=False,
  aggressive=True,
  Event=False)

# The crawl frontier and the set of pages already visited
tocrawl = set([sys.argv[1]])
crawled = set([])
# The original href pattern was lost when the post was rendered as HTML;
# this is a typical reconstruction for pulling link targets out of markup.
linkregex = re.compile(r'<a\s[^>]*href=[\'"](.*?)[\'"]', re.IGNORECASE)

def Update(links):
  if links:
    for link in links:
      # Keep only the scheme + host so each site is queued once
      link = "http://%s" % (urlparse.urlparse(link).netloc)
      if link not in crawled:
        tocrawl.add(link)

def getLinks(crawling):
  crawled.add(crawling)
  try:
    Update(linkregex.findall(requests.get(crawling).content))
  except requests.RequestException:
    return None
    
def crawl():
  try:
    print "%d threads running" % (threading.activeCount())
    crawling = tocrawl.pop()
    print crawling
    print len(crawled)
    walk = gevent.spawn(getLinks, crawling)
    walk.join()
  except KeyError:
    # the frontier is empty; nothing left to visit
    quit()
      
def dispatcher():
  while True:
    T = threading.Thread(target=crawl)
    T.start()
    time.sleep(1)

dispatcher()
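
A quick invocation sketch, assuming the script above is saved as crawler.py
(the seed URL is read from sys.argv[1] and is only an example here):

# python crawler.py http://www.example.com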

A simple webpage parser with NLP included

#!/usr/bin/env python

#
# Base webpage parser
#

from bs4 import BeautifulSoup as bs
import re
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import names, stopwords, brown
import nltk
import urlparse

class PageParser:
  # Word lists
  stopwords = [this.lower() for this in list(stopwords.raw().split("\n"))]
  newswords = [this[0].lower() for this in list(brown.tagged_words(categories='news'))]
  names = [this.lower() for this in list(set(names.raw().split("\n")))]
  # Regular expressions for pulling links, the title and meta keywords out of
  # markup. The original patterns were lost when the post was rendered as
  # HTML; these are typical reconstructions.
  linkRegex = re.compile(r'<a\s[^>]*href=[\'"](.*?)[\'"]', re.IGNORECASE)
  titleRegex = re.compile(r'<title>(.*?)</title>', re.IGNORECASE | re.DOTALL)
  keywordRegex = re.compile(r'<meta\s+name=["\']keywords["\']\s+content=["\'](.*?)["\']', re.IGNORECASE)
  
  def addStopwords(self, newWords):
    for this in newWords:
      self.stopwords.append(this)
  
  # Remove parameters from a link
  def remove_parameters(self,link):
    return link.split("?")[0]
  
  def is_absolute(self,link):
    return len(urlparse.urlparse(link).netloc) > 0
  
  def is_relative(self,link):
    return len(urlparse.urlparse(link).netloc) < 1
  
  def getLinkDepth(self,link):
    return urlparse.urlparse(link.strip("/")).path.count("/")
  
  def getWords(self, markup, Filter=None):
    if Filter is None:
      return list(set(nltk.Text([re.sub("([\'\"\`])", '', this.lower())
                for this in nltk.word_tokenize(nltk.clean_html(markup))
                if len(this) >= 4 and this.lower() not in self.stopwords
                and re.match('[A-Za-z]+', this)]).tokens))
    elif Filter == 'news':
      return list(set(nltk.Text([re.sub("([\'\"\`])", '', this.lower())
                for this in nltk.word_tokenize(nltk.clean_html(markup))
                if re.match('[A-Za-z]+', this.lower())
                and len(this) >= 4 and this.lower() in self.newswords]).tokens))
    elif Filter == 'names':
      return list(set(nltk.Text([re.sub("([\'\"\`])", '', this.lower())
                for this in nltk.word_tokenize(nltk.clean_html(markup))
                if len(this) >= 4 and this.lower() not in self.stopwords
                and re.match('[A-Za-z]+', this.lower()) and this.lower() in self.names]).tokens))
  
  def resolve(self, link, domain):
    # urljoin needs an absolute base, so the bare domain is prefixed with http://
    return urlparse.urljoin("http://%s" % (domain), link)

  def resolveLinks(self, urls, domain):
    return [self.resolve(self.remove_parameters(this.strip("/").strip("#")), domain)
            for this in urls if (this.startswith("/")
            or this.startswith("http")) and self.is_relative(this)]
                
  def getLinks(self,markup,domain=None):
    if domain:
      return list(set(self.resolveLinks(self.linkRegex.findall(markup),domain)))
    else:
      return self.linkRegex.findall(markup)
    
  def getTitle(self,markup):
    # group(1) is just the text between the <title> tags
    return self.titleRegex.search(markup).group(1)
  
  def getKeywords(self,markup):
    return self.keywordRegex.findall(markup)

import requests
parser = PageParser()

def Test(testUrl):
  parser.addStopwords(['amp'])
  testDoc = requests.get(testUrl).content
  title = parser.getTitle(testDoc)
  keywords = parser.getKeywords(testDoc)
  text = parser.getWords(testDoc,'news')
  isAB = parser.is_absolute(testUrl)
  isRE = parser.is_relative(testUrl)
  depth = parser.getLinkDepth(testUrl)
  #links = parser.getLinks(testDoc,'www.cnn.com')
  print "LINK IS ABSOLUTE ? > %s" % (isAB)
  print "LINK IS RELATIVE ? > %s" % (isRE)
  print "LINK DEPTH:        > %s" % (depth)
  print "TITLE:    > %s" % (title)
  print "KEYWORDS: > %s" % (keywords)
  print "TEXT:     > %s" % (" ".join(text))
  #print "LINKS:    > %s" % ("\n".join(links))
Test("http://www.cnn.com")