#!/usr/bin/env python
#
# Base webpage parser
#
import re
import urlparse

import requests
import nltk
from nltk.corpus import names, stopwords, brown
class PageParser:
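    """Helpers for pulling links, titles, keywords and word lists out of HTML markup."""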
# Word lists
    stopwords = [word.lower() for word in stopwords.words()]
    newswords = [word.lower() for word in brown.words(categories='news')]
    names = [word.lower() for word in set(names.words())]
    # Regular expressions for pulling links, titles and keyword metadata from markup
    # (the link and keyword patterns are assumptions; tune them to the markup you crawl)
    linkRegex = re.compile(r'<a\s[^>]*href=[\'"]([^\'"]+)[\'"]', re.IGNORECASE)
    titleRegex = re.compile(r'<title>(.*?)</title>', re.IGNORECASE | re.DOTALL)
    keywordRegex = re.compile(r'<meta\s+name=[\'"]keywords[\'"]\s+content=[\'"](.*?)[\'"]', re.IGNORECASE | re.DOTALL)
    # Add extra stopwords to the shared list
    def addStopwords(self, newWords):
        self.stopwords.extend(newWords)
# Remove parameters from a link
def remove_parameters(self,link):
return link.split("?")[0]
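    # A link is absolute if it carries its own network location (scheme://host)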
def is_absolute(self,link):
return len(urlparse.urlparse(link).netloc) > 0
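    # A link is relative if it has no network location of its own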
def is_relative(self,link):
return len(urlparse.urlparse(link).netloc) < 1
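    # Depth is the number of path segments below the site root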
def getLinkDepth(self,link):
return urlparse.urlparse(link.strip("/")).path.count("/")
    # Strip markup, tokenize, and return the distinct lower-cased words;
    # Filter='news' keeps only Brown news vocabulary, Filter='names' keeps only names
    def getWords(self, markup, Filter=None):
        tokens = [re.sub(r"['\"`]", '', word.lower())
                  for word in nltk.word_tokenize(nltk.clean_html(markup))]
        words = [word for word in tokens
                 if len(word) >= 4
                 and re.match(r'[a-z]+$', word)
                 and word not in self.stopwords]
        if Filter == 'news':
            words = [word for word in words if word in self.newswords]
        elif Filter == 'names':
            words = [word for word in words if word in self.names]
        return list(set(words))
    # Resolve a (possibly relative) link against a domain name
    def resolve(self, link, domain):
        return urlparse.urljoin("http://%s" % (domain,), link)
    # Resolve root-relative links against the domain, dropping parameters and fragments
    def resolveLinks(self, urls, domain):
        return [self.resolve(self.remove_parameters(link.strip("/").strip("#")), domain)
                for link in urls
                if link.startswith("/")
                or (link.startswith("http") and self.is_relative(link))]
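    # Extract links from markup; when a domain is given, resolve and de-duplicate them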
def getLinks(self,markup,domain=None):
if domain:
return list(set(self.resolveLinks(self.linkRegex.findall(markup),domain)))
else:
return self.linkRegex.findall(markup)
    # Return the text of the <title> tag, or None if there isn't one
    def getTitle(self, markup):
        match = self.titleRegex.search(markup)
        return match.group(1).strip() if match else None
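    # Return the content of any <meta name="keywords"> tags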
def getKeywords(self,markup):
return self.keywordRegex.findall(markup)
parser = PageParser()
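# Simple smoke test: fetch a live page and exercise the parser methods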
def Test(testUrl):
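    # 'amp' tends to survive from stripped "&amp;" entities, so treat it as a stopword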
parser.addStopwords(['amp'])
testDoc = requests.get(testUrl).content
title = parser.getTitle(testDoc)
keywords = parser.getKeywords(testDoc)
text = parser.getWords(testDoc,'news')
isAB = parser.is_absolute(testUrl)
isRE = parser.is_relative(testUrl)
depth = parser.getLinkDepth(testUrl)
#links = parser.getLinks(testDoc,'www.cnn.com')
print "LINK IS ABSOLUTE ? > %s" % (isAB)
print "LINK IS RELATIVE ? > %s" % (isRE)
print "LINK DEPTH: > %s" % (depth)
print "TITLE: > %s" % (title)
print "KEYWORDS: > %s" % (keywords)
print "TEXT: > %s" % (" ".join(text))
#print "LINKS: > %s" % ("\n".join(links))
Test("http://www.cnn.com")