A simple webpage parser with NLP included

#!/usr/bin/env python

#
# Base webpage parser
#

from bs4 import BeautifulSoup as bs
import re
import urlparse
import nltk
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import names, stopwords, brown

class PageParser:
  # Word lists
  stopwords = [this.lower() for this in list(stopwords.raw().split("\n"))]
  newswords = [this[0].lower() for this in list(brown.tagged_words(categories='news') )]
  names = [this.lower() for this in list(set(names.raw().split("\n"))) ]
  # Regular expressions for pulling links, the page title, and meta keywords from markup
  linkRegex = re.compile('<a[^>]+href=["\']?([^"\' >]+)', re.IGNORECASE)
  titleRegex = re.compile('<title>(.*?)</title>', re.IGNORECASE|re.DOTALL)
  keywordRegex = re.compile('<meta[^>]+name=["\']keywords["\'][^>]+content=["\'](.*?)["\']', re.IGNORECASE)
  
  # Append extra (lowercased) stopwords to the shared word list
  def addStopwords(self,newWords):
    self.stopwords.extend(this.lower() for this in newWords)
  
  # Remove parameters from a link
  def remove_parameters(self,link):
    return link.split("?")[0]
  
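  # A link is absolute when urlparse finds a network location (host) in it,
  # and relative when it does not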
  def is_absolute(self,link):
    return len(urlparse.urlparse(link).netloc) > 0
  
  def is_relative(self,link):
    return len(urlparse.urlparse(link).netloc) < 1
  
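  # Link depth = number of path segments below the domain root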
  def getLinkDepth(self,link):
    return urlparse.urlparse(link.strip("/")).path.count("/")
  
  # Tokenize the page text (HTML stripped) and return a unique, lowercased word list.
  # Filter=None keeps anything that is not a stopword, Filter='news' keeps only words
  # seen in the Brown news corpus, Filter='names' keeps only words from the names corpus.
  def getWords(self,markup,Filter=None):
    if Filter is None:
      return list(set(nltk.Text([ re.sub("([\'\"\`])",'',this.lower()) \
                for this in nltk.word_tokenize(nltk.clean_html(markup)) \
                if len(this) >= 4 and this.lower() not in self.stopwords \
                and re.match('[A-Za-z ]+',this) ]).tokens))
    elif Filter == 'news':
      return list(set(nltk.Text([ re.sub("([\'\"\`])",'',this.lower()) \
                for this in nltk.word_tokenize(nltk.clean_html(markup)) \
                if re.match('[A-Za-z ]+',this.lower()) \
                and len(this) >= 4 and this.lower() in self.newswords]).tokens))
    elif Filter == 'names':
      return list(set(nltk.Text([ re.sub("([\'\"\`])",'',this.lower()) \
                for this in nltk.word_tokenize(nltk.clean_html(markup)) \
                if len(this) >= 4 and this.lower() not in self.stopwords \
                and re.match('[A-Za-z ]+',this.lower()) and this.lower() in self.names ]).tokens))
  
  # Join a (possibly relative) link against its domain to form an absolute URL
  def resolve(self,link,domain):
    return urlparse.urljoin("http://%s"%(domain),link)
    
  # Resolve relative links (those without their own host) against the given domain
  def resolveLinks(self,urls,domain):
    return [self.resolve(self.remove_parameters(this.strip("/").strip("#")),domain) \
            for this in urls if this.startswith("/") \
            or (this.startswith("http") and self.is_relative(this))]
                
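  # Pull href targets out of the markup; resolve them against a domain when one is given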
  def getLinks(self,markup,domain=None):
    if domain:
      return list(set(self.resolveLinks(self.linkRegex.findall(markup),domain)))
    else:
      return self.linkRegex.findall(markup)
    
  # Return just the text inside the <title> tag
  def getTitle(self,markup):
    return self.titleRegex.search(markup).group(1)
  
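  # Return the content of any meta keywords tags found in the markup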
  def getKeywords(self,markup):
    return self.keywordRegex.findall(markup)

import requests
parser = PageParser()

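# Quick smoke test: fetch a live page and exercise the parser's main methods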
def Test(testUrl):
  parser.addStopwords(['amp'])
  testDoc = requests.get(testUrl).content
  title = parser.getTitle(testDoc)
  keywords = parser.getKeywords(testDoc)
  text = parser.getWords(testDoc,'news')
  isAB = parser.is_absolute(testUrl)
  isRE = parser.is_relative(testUrl)
  depth = parser.getLinkDepth(testUrl)
  #links = parser.getLinks(testDoc,'www.cnn.com')
  print "LINK IS ABSOLUTE ? > %s" % (isAB)
  print "LINK IS RELATIVE ? > %s" % (isRE)
  print "LINK DEPTH:        > %s" % (depth)
  print "TITLE:    > %s" % (title)
  print "KEYWORDS: > %s" % (keywords)
  print "TEXT:     > %s" % (" ".join(text))
  #print "LINKS:    > %s" % ("\n".join(links))
Test("http://www.cnn.com")