Showing posts with label script. Show all posts

A simple Python web crawler

import sys
import re
import urllib2
import urlparse
import requests
import socket
import threading
import gevent
from gevent import monkey
import time

# Monkey-patch the blocking stdlib (sockets, DNS resolution, time.sleep,
# select, threading primitives, os, ssl) so gevent greenlets can yield
# cooperatively during I/O instead of blocking the whole process.
# httplib/subprocess/sys patching and Event patching are deliberately
# left disabled -- presumably so real threading.Thread workers (see
# dispatcher below) keep native semantics; confirm against gevent docs.
monkey.patch_all(
  socket=True,
  dns=True,
  time=True,
  select=True,
  thread=True,
  os=True,
  ssl=True,
  httplib=False,
  subprocess=False,
  sys=False,
  aggressive=True,
  Event=False)

# Crawl frontier: URLs waiting to be fetched, seeded from the command line.
# Using a set (not a stack) so duplicate discoveries collapse automatically.
# sys.argv[1:2] yields an empty frontier instead of crashing when no seed
# URL is supplied.
tocrawl = set(sys.argv[1:2])
# URLs (really, "http://<host>" roots) already fetched or being fetched.
crawled = set([])
# Extracts the href target of anchor tags from fetched HTML.
# NOTE(review): the published pattern was empty (re.compile('')), which can
# never match a link -- this is a best-effort reconstruction; confirm
# against the page format actually being crawled.
linkregex = re.compile(r'<a\s[^>]*href=[\'"](.*?)[\'"]', re.IGNORECASE)

def Update(links):
  """Queue newly discovered links onto the crawl frontier.

  Each link is reduced to its host root ("http://<netloc>") so the crawler
  walks sites rather than individual pages.  Hosts already crawled are
  skipped; ``tocrawl`` being a set deduplicates the rest.

  ``links`` -- list of URL strings (or None, which is a no-op).
  """
  if links is None:
    return
  # Plain iteration: the original popped every element off the front of
  # the list (O(n^2)) and mutated the caller's list for no benefit.
  for link in links:
    root = "http://%s" % urlparse.urlparse(link).netloc
    if root not in crawled:
      tocrawl.add(root)

def getLinks(crawling):
  """Fetch ``crawling``, extract its links and queue them via Update().

  The URL is marked as crawled *before* fetching so a failing host is
  never retried.  Errors are swallowed on purpose (best-effort crawl),
  but the original bare ``except:`` also ate KeyboardInterrupt and
  SystemExit -- narrowed to Exception so shutdown signals propagate.
  """
  crawled.add(crawling)
  try:
    # timeout keeps a dead or slow host from stalling a greenlet forever
    response = requests.get(crawling, timeout=10)
    Update(linkregex.findall(response.content))
  except Exception:
    return None
    
def crawl():
  try:
    print"%d Threads running" % (threading.activeCount())
    crawling = tocrawl.pop()
    print crawling
    print len(crawled)
    walk = gevent.spawn(getLinks,crawling)
    walk.run()
  except:quit()
      
def dispatcher():
  """Spawn one crawl() worker thread per second, forever.

  NOTE(review): thread creation is unbounded -- slow hosts accumulate live
  workers; a ThreadPool would cap this.  Left as-is to preserve behavior.
  """
  while True:
    worker = threading.Thread(target=crawl)
    worker.start()
    time.sleep(1)  # throttle: at most one new worker per second


# Guard the entry point so importing this module doesn't start crawling.
if __name__ == "__main__":
  dispatcher()