A simple webpage parser with NLP included

#!/usr/bin/env python
#
# Base webpage parser
#
from bs4 import BeautifulSoup as bs
import re
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import names, stopwords, brown
import nltk
import urlparse   # Python 2; this module became urllib.parse in Python 3


class PageParser:
    # Word lists built from the NLTK corpora (requires the stopwords, brown
    # and names corpora to be downloaded via nltk.download())
    stopwords = [this.lower() for this in stopwords.raw().split("\n")]
    newswords = [this[0].lower() for this in brown.tagged_words(categories='news')]
    names = [this.lower() for this in set(names.raw().split("\n"))]

    # Regular expressions for pulling links, the page title and the meta
    # keywords out of raw markup. The original angle-bracket patterns did not
    # survive the blog's HTML escaping, so the patterns below are reasonable
    # reconstructions of what the accessor methods expect.
    linkRegex = re.compile(r'<a\s+[^>]*?href="(.*?)"', re.IGNORECASE)
    titleRegex = re.compile(r'<title>(.*?)</title>', re.IGNORECASE | re.DOTALL)
    keywordRegex = re.compile(r'<meta\s+name="keywords"\s+content="(.*?)"', re.IGNORECASE)

    def addStopwords(self, newWords):
        self.stopwords.extend(newWords)

    # Remove query-string parameters from a link
    def remove_parameters(self, link):
        return link.split("?")[0]

    def is_absolute(self, link):
        return len(urlparse.urlparse(link).netloc) > 0

    def is_relative(self, link):
        return len(urlparse.urlparse(link).netloc) < 1

    # Depth of a link, i.e. the number of path segments below the domain
    def getLinkDepth(self, link):
        return urlparse.urlparse(link.strip("/")).path.count("/")

    # Tokenize the visible text of a page and optionally keep only words that
    # appear in the Brown news corpus ('news') or the names corpus ('names').
    # Note that nltk.clean_html() only exists in the older NLTK 2.x API.
    def getWords(self, markup, Filter=None):
        if Filter is None:
            return list(set(nltk.Text([re.sub("([\'\"\`])", '', this.lower())
                for this in nltk.word_tokenize(nltk.clean_html(markup))
                if len(this) >= 4 and this.lower() not in self.stopwords
                and re.match('([A-Za-z ]+)+', this)]).tokens))
        elif Filter == 'news':
            return list(set(nltk.Text([re.sub("([\'\"\`])", '', this.lower())
                for this in nltk.word_tokenize(nltk.clean_html(markup))
                if re.match('([A-Za-z ]+)+', this.lower())
                and len(this) >= 4 and this.lower() in self.newswords]).tokens))
        elif Filter == 'names':
            return list(set(nltk.Text([re.sub("([\'\"\`])", '', this.lower())
                for this in nltk.word_tokenize(nltk.clean_html(markup))
                if len(this) >= 4 and this.lower() not in self.stopwords
                and re.match('([A-Za-z ]+)+', this.lower())
                and this.lower() in self.names]).tokens))

    # Resolve a (possibly relative) link against a bare domain
    def resolve(self, link, domain):
        return urlparse.urljoin("http://%s" % (domain), link)

    def resolveLinks(self, urls, domain):
        return [self.resolve(self.remove_parameters(this.strip("/").strip("#")), domain)
                for this in urls if this.startswith("/")
                or (this.startswith("http") and self.is_relative(this))]

    def getLinks(self, markup, domain=None):
        if domain:
            return list(set(self.resolveLinks(self.linkRegex.findall(markup), domain)))
        else:
            return self.linkRegex.findall(markup)

    def getTitle(self, markup):
        return self.titleRegex.search(markup).group(1)

    def getKeywords(self, markup):
        return self.keywordRegex.findall(markup)


# Quick smoke test against a live page
import requests

parser = PageParser()

def Test(testUrl):
    parser.addStopwords(['amp'])
    testDoc = requests.get(testUrl).content
    title = parser.getTitle(testDoc)
    keywords = parser.getKeywords(testDoc)
    text = parser.getWords(testDoc, 'news')
    isAB = parser.is_absolute(testUrl)
    isRE = parser.is_relative(testUrl)
    depth = parser.getLinkDepth(testUrl)
    #links = parser.getLinks(testDoc, 'www.cnn.com')
    print "LINK IS ABSOLUTE ? > %s" % (isAB)
    print "LINK IS RELATIVE ? > %s" % (isRE)
    print "LINK DEPTH: > %s" % (depth)
    print "TITLE: > %s" % (title)
    print "KEYWORDS: > %s" % (keywords)
    print "TEXT: > %s" % (" ".join(text))
    #print "LINKS: > %s" % ("\n".join(links))

Test("http://www.cnn.com")
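
The Test() routine above already exercises the parser against a live page. As a smaller offline illustration, here is a minimal sketch that runs the link extraction and the 'names' word filter over a hard-coded piece of markup. It assumes Python 2, an NLTK 2.x release that still provides nltk.clean_html(), the stopwords/brown/names corpora already downloaded via nltk.download(), and the reconstructed regular expressions shown in the class above; the sample markup and the variable names sample and p are purely illustrative.

# Offline usage sketch (assumptions: Python 2, NLTK 2.x with clean_html,
# corpora downloaded, and the reconstructed regexes from the class above)
sample = ('<html><head><title>Sample page</title></head><body>'
          '<a href="/world/europe?ref=nav">Europe</a>'
          '<p>Angela Merkel and David Cameron discussed trade policy.</p>'
          '</body></html>')

p = PageParser()
print p.getTitle(sample)                 # contents of the <title> tag
print p.getLinks(sample)                 # raw href values, e.g. ['/world/europe?ref=nav']
print p.getLinks(sample, 'www.cnn.com')  # the same links resolved against the domain,
                                         # with query parameters stripped
print p.getWords(sample, 'names')        # words from the page text that appear in the
                                         # names corpus (exact output depends on the corpora)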