
Python: tokenize.py Updated

#!/usr/bin/env python

"""
Tokenization is the process of breaking a stream of text up into words, phrases,
symbols, or other meaningful elements called tokens. The list of tokens becomes
input for further processing such as parsing or text mining. -- Wikipedia
"""

import re
import nltk.corpus

def tokenize(**kw):
    """ Tokenize string data """
    # Defaults
    kw.setdefault('text', None)
    kw.setdefault('min_len', 4)
    kw.setdefault('ignore_list', [])
    kw.setdefault('filter_stopwords', False)
    kw.setdefault('filter_plainwords', False)

    if kw['text'] is None:
        return None

    if kw['filter_stopwords']:
        kw['ignore_list'] += nltk.corpus.stopwords.words()

    if kw['filter_plainwords']:
        plain_words = nltk.corpus.abc.words()
    else:
        plain_words = []

    # Pull out alphabetic runs only; digits and punctuation are dropped
    matches = set(re.findall('[A-Za-z]+', kw['text']))

    # Lowercase, drop ignored words (stopwords are lowercase) and short tokens
    tokens = set([token.lower() for token in matches
                  if token.lower() not in kw['ignore_list']
                  and len(token) >= kw['min_len']])

    if not kw['filter_plainwords']:
        return tokens
    else:
        return tokens.intersection(plain_words)

def test():
    """ tokenize() test """
    import requests
    text = raw_input("path or url to string data > ")
    if not text.startswith('http'):
        text = open(text, 'r').read()
    else:
        text = requests.get(text).content
    print tokenize(text=text,
                   min_len=4,
                   ignore_list=['code'],
                   filter_plainwords=True,
                   filter_stopwords=True)


## TESTING tokenize
if __name__ == '__main__':
    test()
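
Called on a plain string (and assuming the NLTK stopwords corpus has already been downloaded via nltk.download()), tokenize() behaves roughly as sketched below; the exact set depends on the corpus contents, so treat the output as illustrative rather than definitive.

# Illustrative usage of the tokenize() defined above -- assumes the NLTK
# 'stopwords' corpus is installed; the output shown is approximate.
sample = "The tokenizer breaks a stream of text into meaningful tokens."
print tokenize(text=sample, min_len=4, filter_stopwords=True)
# roughly: set(['tokenizer', 'breaks', 'stream', 'text', 'meaningful', 'tokens'])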
 

A simple webpage parser with NLP included

#!/usr/bin/env python

#
# Base webpage parser
#

import re
import urlparse

import nltk
from bs4 import BeautifulSoup as bs
from nltk.corpus import names, stopwords, brown

class PageParser:
  # Word lists
  stopwords = [this.lower() for this in stopwords.raw().split("\n")]
  newswords = [this[0].lower() for this in brown.tagged_words(categories='news')]
  names = [this.lower() for this in set(names.raw().split("\n"))]
  # Regular expressions for pulling links, the title and meta keywords from markup
  linkRegex = re.compile('<a[^>]+href="(.*?)"', re.IGNORECASE)
  titleRegex = re.compile('<title>(.*?)</title>', re.IGNORECASE|re.DOTALL)
  keywordRegex = re.compile('<meta[^>]+name="keywords"[^>]+content="(.*?)"', re.IGNORECASE)
  
  def addStopwords(self,newWords):
    self.stopwords.extend(newWords)
  
  # Remove parameters from a link
  def remove_parameters(self,link):
    return link.split("?")[0]
  
  def is_absolute(self,link):
    return len(urlparse.urlparse(link).netloc) > 0
  
  def is_relative(self,link):
    return len(urlparse.urlparse(link).netloc) < 1
  
  def getLinkDepth(self,link):
    return urlparse.urlparse(link.strip("/")).path.count("/")
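  
  # For example, is_absolute("http://example.com/a/b") is True (parsed netloc
  # "example.com"), is_relative("/a/b") is True (empty netloc), and
  # getLinkDepth("http://example.com/a/b/c") counts the "/" characters in the
  # parsed path "/a/b/c", giving 3.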
  
  def getWords(self,markup,Filter=None):
    # nltk.clean_html strips tags (available in NLTK 2.x); tokens are
    # lowercased, stripped of quotes, and kept if alphabetic and >= 4 chars
    words = []
    for this in nltk.word_tokenize(nltk.clean_html(markup)):
      this = re.sub("['\"`]", '', this.lower())
      if len(this) >= 4 and re.match('[A-Za-z]+$', this):
        words.append(this)
    if Filter is None:
      return list(set([ this for this in words if this not in self.stopwords ]))
    elif Filter == 'news':
      return list(set([ this for this in words if this in self.newswords ]))
    elif Filter == 'names':
      return list(set([ this for this in words \
                        if this not in self.stopwords and this in self.names ]))
  
  def resolve(self,link,domain):
      return urlparse.urljoin("http://%s" % (domain), link)
    
  # Resolve relative links against the domain, stripping parameters and fragments
  def resolveLinks(self,urls,domain):
    return [self.resolve(self.remove_parameters(this.strip("/").strip("#")), domain) \
            for this in urls \
            if this.startswith("/") or self.is_relative(this)]
                
  def getLinks(self,markup,domain=None):
    if domain:
      return list(set(self.resolveLinks(self.linkRegex.findall(markup),domain)))
    else:
      return self.linkRegex.findall(markup)
    
  def getTitle(self,markup):
    match = self.titleRegex.search(markup)
    return match.group(1) if match else None
  
  def getKeywords(self,markup):
    return self.keywordRegex.findall(markup)

import requests
parser = PageParser()

def Test(testUrl):
  parser.addStopwords(['amp'])
  testDoc = requests.get(testUrl).content
  title = parser.getTitle(testDoc)
  keywords = parser.getKeywords(testDoc)
  text = parser.getWords(testDoc,'news')
  isAB = parser.is_absolute(testUrl)
  isRE = parser.is_relative(testUrl)
  depth = parser.getLinkDepth(testUrl)
  #links = parser.getLinks(testDoc,'www.cnn.com')
  print "LINK IS ABSOLUTE ? > %s" % (isAB)
  print "LINK IS RELATIVE ? > %s" % (isRE)
  print "LINK DEPTH:        > %s" % (depth)
  print "TITLE:    > %s" % (title)
  print "KEYWORDS: > %s" % (keywords)
  print "TEXT:     > %s" % (" ".join(text))
  #print "LINKS:    > %s" % ("\n".join(links))
Test("http://www.cnn.com")