Search by Image, Recursively, Transparent PNG, #1
Search by Image, Recursively, Transparent PNG, #1 by Sebastian Schmieg:
import re, subprocess, time
class GoogleSearchByImage:
    """Fetch and parse Google "search by image" result pages using curl.

    Typical use: call ``scrape(image_url)`` to download the result page for
    an image, then ``getSimilarImages()`` and/or
    ``getLinkToSimilarImagesPage()`` to extract data from the page that was
    downloaded last.

    Requests are throttled so that at least ``MIN_SECONDS_BETWEEN_REQUESTS``
    seconds pass between the end of one ``scrape`` and the start of the next.
    """

    GOOGLE_URL = "http://www.google.com"
    GOOGLE_SBI_URL = "/searchbyimage?image_url="
    # User-Agent string handed to curl via -A.
    AGENT_ID = "Mozilla/5.0 (X11; Linux x86_64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1"
    MIN_SECONDS_BETWEEN_REQUESTS = 2

    # Class-level defaults; scrape() shadows them with per-instance values.
    _myLastRequestTimestamp = 0
    _myCurrentHtml = ""

    def scrape(self, theReference):
        """Download the search-by-image result page for ``theReference``.

        ``theReference`` is appended verbatim to the search URL (it is not
        URL-encoded here). The downloaded page is stored on the instance for
        the ``get*`` methods; nothing is returned.
        """
        # Throttle: wait until the minimum gap since the last request has
        # elapsed. The remaining time is computed once per iteration so a
        # negative value is never passed to time.sleep().
        while True:
            myRemaining = self.MIN_SECONDS_BETWEEN_REQUESTS - (
                time.time() - self._myLastRequestTimestamp
            )
            if myRemaining <= 0:
                break
            time.sleep(myRemaining)

        self._myCurrentHtml = self.getHtml(
            self.GOOGLE_URL + self.GOOGLE_SBI_URL + theReference
        )
        self._myLastRequestTimestamp = time.time()

    def getHtml(self, theUrl):
        """Return the body of ``theUrl`` as text, fetched with curl.

        curl is run with ``-L`` (follow redirects) and the class's
        ``AGENT_ID`` as User-Agent. curl's stderr is merged into the
        captured output. If curl cannot be started or exits with a non-zero
        status, the call waits 10 seconds and tries again, indefinitely.
        """
        myCommand = ["curl", "-L", "-A", self.AGENT_ID, theUrl]
        while True:
            try:
                myHtml = subprocess.check_output(myCommand, stderr=subprocess.STDOUT)
            except (subprocess.CalledProcessError, OSError):
                # Only curl failures are retried; KeyboardInterrupt and
                # SystemExit propagate so the loop can be stopped.
                print("Curl error. Will sleep for 10 seconds")
                time.sleep(10)
                continue
            # check_output returns bytes on Python 3; the regexes in this
            # class are text patterns, so hand back text.
            if not isinstance(myHtml, str):
                myHtml = myHtml.decode("utf-8", "replace")
            return myHtml

    def getSimilarImages(self):
        """Return the image URLs linked from the last scraped page.

        Each URL is the value of the ``imgurl`` parameter of an
        ``/imgres?imgurl=...`` link, cut at the first ``&`` or ``%3F``.
        Returns an empty list when the page contains no such links.
        """
        myPattern = re.compile(r'" href="/imgres\?imgurl=(.*?)(?:&|%3F)')
        return myPattern.findall(self._myCurrentHtml)

    def getLinkToSimilarImagesPage(self):
        """Return the absolute URL of the "Visually similar images" page.

        Raises:
            IndexError: if the last scraped page has no such link.
        """
        # NOTE(review): the published listing of this code lost the markup
        # part of this pattern (and the "&amp;" entity below); both are
        # reconstructed here - verify against a live result page.
        myPattern = re.compile(
            r'<a [^>]*?href="([^"]*)"[^>]*>Visually similar images</a>'
        )
        myMatches = myPattern.findall(self._myCurrentHtml)
        if not myMatches:
            raise IndexError("no 'Visually similar images' link in current page")
        # The href is HTML-escaped in the page source; restore plain "&".
        myPageUrl = myMatches[0].replace("&amp;", "&")
        myPageUrl += "&biw=1600&bih=825"  # always keep this
        return self.GOOGLE_URL + myPageUrl
Leave a comment