"""A simple web crawler (Python 2) built on gevent and requests.

Usage: pass the seed URL as the first command-line argument.  The script
repeatedly takes a URL from the frontier, downloads it, extracts the hosts
it links to, and queues every host that has not been crawled yet.
"""
# gevent must patch the standard library before any module that uses
# sockets or threads is imported; otherwise those modules keep references
# to the unpatched, blocking implementations.
from gevent import monkey

monkey.patch_all(
    socket=True, dns=True, time=True, select=True, thread=True, os=True,
    ssl=True, httplib=False, subprocess=False, sys=False, aggressive=True,
    Event=False)

import re
import socket
import sys
import threading
import time
import urllib2
import urlparse

import gevent
import requests

# Frontier of URLs still to visit.  It is a set, so the visiting order is
# arbitrary and duplicates are collapsed automatically.
tocrawl = set([sys.argv[1]])
# URLs that have already been handed to getLinks().
crawled = set()
# NOTE(review): the pattern in the source this was recovered from was the
# empty string, which matches at every position and therefore yields only
# empty "links".  It was presumably lost when the page was rendered as HTML;
# an href-extracting pattern is used here -- confirm against the author's
# intent.
linkregex = re.compile(r'<a\s[^>]*?href=[\'"](.*?)[\'"]', re.IGNORECASE)


def Update(links):
    """Queue the host root of every link that has not been crawled yet.

    Each link is reduced to ``http://<netloc>``; the path, query and
    original scheme are discarded.  Links without a network location
    (relative links, fragments, ``mailto:`` and similar) are skipped.

    :param links: iterable of URL strings, or ``None`` (treated as empty).
    """
    if not links:
        return
    for link in links:
        netloc = urlparse.urlparse(link).netloc
        if not netloc:
            # Nothing to build a host URL from.
            continue
        host_url = "http://%s" % netloc
        if host_url not in crawled:
            tocrawl.add(host_url)


def getLinks(crawling):
    """Download *crawling* and feed the links found in it to Update().

    The URL is recorded in ``crawled`` before the request is made, so a
    failing URL is not retried.  Download failures and malformed URLs are
    ignored on purpose: the crawl is best-effort.

    :param crawling: URL to download.
    :returns: always ``None``.
    """
    crawled.add(crawling)
    try:
        page = requests.get(crawling).content
        Update(linkregex.findall(page))
    except (requests.RequestException, ValueError):
        # RequestException: network/HTTP-level failure while downloading.
        # ValueError: urlparse rejected a malformed URL found in the page.
        return None


def crawl():
    """Take one URL from the frontier and crawl it in a greenlet.

    Prints the number of live threads, the URL being crawled and the number
    of URLs crawled so far.  Returns immediately when the frontier is empty.
    """
    print("%d Threads running" % threading.active_count())
    try:
        crawling = tocrawl.pop()
    except KeyError:
        # Frontier is empty right now; nothing for this worker to do.
        return
    print(crawling)
    print(len(crawled))
    # join() waits for the greenlet through the public API instead of
    # invoking its run() method by hand.
    gevent.spawn(getLinks, crawling).join()


def dispatcher():
    """Start one crawl() worker thread per second, forever."""
    while True:
        worker = threading.Thread(target=crawl)
        worker.start()
        time.sleep(1)


dispatcher()
Showing posts with label web-crawler. Show all posts
Showing posts with label web-crawler. Show all posts
A simple python web-crawler
Subscribe to:
Posts (Atom)