import sys
import re
import urllib2
import urlparse
import requests
import socket
import threading
import gevent
from gevent import monkey
import time
# Patch blocking stdlib modules so they cooperate with gevent greenlets.
# httplib/subprocess/sys patching is explicitly disabled; aggressive=True
# additionally patches modules gevent considers risky to leave unpatched.
# NOTE(review): patch_all should normally run before the other imports
# above so already-imported modules are fully patched — confirm intent.
monkey.patch_all(
socket=True,
dns=True,
time=True,
select=True,
thread=True,
os=True,
ssl=True,
httplib=False,
subprocess=False,
sys=False,
aggressive=True,
Event=False)
# The crawl frontier, seeded with the start URL from the command line.
tocrawl = set([sys.argv[1]])
# Site roots ("http://<netloc>") that have already been fetched.
crawled = set([])
# Extract href targets from anchor tags.  The original pattern was the
# empty string, which matches at every position and can never yield a
# usable link, so the crawler would find nothing.
linkregex = re.compile(r'<a\s[^>]*href=[\'"]?([^\'" >]+)', re.IGNORECASE)
def Update(links):
    """Queue the site root of each extracted link for crawling.

    links may be None (treated as "no links").  Every link is reduced to
    "http://<netloc>" so the crawler visits at most one page per host;
    roots already in `crawled` are skipped.
    """
    if links is None:
        return
    # Plain iteration replaces the original pop(0)-per-element generator,
    # which consumed the list destructively in O(n^2).
    for link in links:
        root = "http://%s" % urlparse.urlparse(link).netloc
        if root not in crawled:
            tocrawl.add(root)
def getLinks(crawling):
    """Mark *crawling* as visited, fetch it, and queue any links found.

    Best-effort: a failed fetch simply yields no new links (returns None),
    but programming errors are no longer hidden by a bare except.
    """
    crawled.add(crawling)
    try:
        response = requests.get(crawling)
        Update(linkregex.findall(response.content))
    except (requests.RequestException, socket.error):
        return None
def crawl():
    """Pop one URL off the frontier and crawl it inside a greenlet.

    Terminates the whole program once the frontier is empty (set.pop()
    raises KeyError).  The original bare `except: quit()` also swallowed
    every other error; only exhaustion should end the run.
    """
    try:
        print("%d Threads running" % threading.activeCount())
        crawling = tocrawl.pop()  # KeyError when the frontier is empty
        print(crawling)
        print(len(crawled))
        walk = gevent.spawn(getLinks, crawling)
        # spawn() already schedules the greenlet; wait for it to finish
        # instead of calling .run() on an already-started greenlet.
        walk.join()
    except KeyError:
        quit()
def dispatcher():
    """Start a new crawler thread every second, forever.

    Each thread processes exactly one URL; the program exits when crawl()
    calls quit() on an empty frontier.
    """
    while True:
        worker = threading.Thread(target=crawl)
        worker.start()
        time.sleep(1)

# Guard the entry point so importing this module does not start crawling.
if __name__ == "__main__":
    dispatcher()
# A simple python web-crawler