1+ try :
2+ # import dependencies
3+ from selenium import webdriver
4+ from selenium .webdriver .common .by import By
5+ from selenium .webdriver .common .keys import Keys
6+ from selenium .webdriver .support .select import Select
7+ from selenium .webdriver .chrome .service import Service
8+ from PIL import Image , ImageCms , ImageFilter
9+ import numpy as np
10+ import pytesseract
11+ import cv2
12+ import time
13+ import os
14+
15+ ## Helper Functions
16+
def step1():
    """Load the results page and save a screenshot of its captcha image.

    Navigates the module-level ``driver`` to ``URL``, looks up the element
    whose id is ``imgCaptcha`` and writes that element's PNG screenshot
    bytes to the file named by the module-level ``path``.
    """
    driver.get(URL)

    captcha_element = driver.find_element(By.ID, "imgCaptcha")
    with open(path, "wb") as captcha_file:
        captcha_file.write(captcha_element.screenshot_as_png)
25+
def step2():
    """Paint long horizontal dark runs of the captcha white.

    Reads the captcha stored at the module-level ``path``, builds a mask of
    the structures that survive a morphological opening with a 25x1
    rectangular kernel (presumably the strike-through lines drawn over the
    captcha text) and pastes that mask, in white, over the captcha.

    Files written:
        ``old.png`` -- the intermediate mask (white where a run was found).
        ``new.png`` -- the captcha with the masked pixels painted white.
    """
    # Inverted Otsu threshold: dark pixels of the captcha become 255.
    captcha_bgr = cv2.imread(path)
    grayscale = cv2.cvtColor(captcha_bgr, cv2.COLOR_BGR2GRAY)
    _, binary_inv = cv2.threshold(
        grayscale, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )

    # Opening with a wide, one-pixel-high kernel keeps only horizontal runs.
    line_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 1))
    line_mask = cv2.morphologyEx(
        binary_inv, cv2.MORPH_OPEN, line_kernel, iterations=2
    )
    cv2.imwrite("old.png", line_mask)

    # Turn the mask into an RGBA overlay: black pixels become fully
    # transparent, everything else (the detected runs) stays as it is.
    overlay = Image.open("old.png").convert("RGBA")
    transparent_white = (255, 255, 255, 0)
    overlay.putdata([
        transparent_white if pixel[:3] == (0, 0, 0) else pixel
        for pixel in overlay.getdata()
    ])

    # Paste the overlay onto the captcha, using its own alpha as the mask.
    captcha_rgba = Image.open(path).convert("RGBA")
    captcha_rgba.paste(overlay, mask=overlay)
    captcha_rgba.save("new.png", "PNG")
53+
def step3(im, max_retries=None):
    """Return the OCR text of a cleaned captcha, retrying while it is empty.

    The image is OCR'd via ``_ocr_captcha``. When the stripped result is
    empty, a fresh captcha is fetched and cleaned (``step1`` + ``step2``)
    and ``"new.png"`` is OCR'd again. The retry is a loop rather than
    recursion, so a long run of empty results cannot end in
    ``RecursionError``.

    Args:
        im: Path (or file object) of the image to read first; passed to
            ``Image.open``.
        max_retries: Optional upper bound on the number of re-fetches.
            ``None`` (the default) keeps retrying until OCR yields text.

    Returns:
        The stripped OCR text. It is only empty when ``max_retries`` is
        set and has been exhausted. The text may still be a wrong reading
        of the captcha.
    """
    source = im
    retries = 0
    while True:
        text = _ocr_captcha(source)
        if text:
            return text
        if max_retries is not None and retries >= max_retries:
            return text
        retries += 1
        # Empty result: fetch and clean a new captcha, then read that one.
        step1()
        step2()
        source = "new.png"


def _ocr_captcha(source):
    """Crop the image, take its LAB lightness channel and OCR it."""
    picture = Image.open(source)
    picture = picture.crop((5, 5, 115, 35))

    # The transform below is declared with inMode='RGB', while step2 saves
    # new.png as RGBA; drop the alpha channel so the modes agree.
    picture = picture.convert("RGB")

    rgb_profile = ImageCms.createProfile(colorSpace='sRGB')
    lab_profile = ImageCms.createProfile(colorSpace='LAB')
    rgb_to_lab = ImageCms.buildTransform(
        inputProfile=rgb_profile,
        outputProfile=lab_profile,
        inMode='RGB',
        outMode='LAB',
    )
    lab_picture = ImageCms.applyTransform(im=picture, transform=rgb_to_lab)

    # Only the first (L) channel is used for OCR.
    lightness = lab_picture.split()[0]
    lightness = lightness.filter(ImageFilter.MinFilter(3))
    return pytesseract.image_to_string(lightness).strip()
73+
def step4(enroll, ans, batch_value="3151$W2021$2022-03-16$current$0"):
    """Submit one enrollment number with the captcha answer and read the result.

    Args:
        enroll: Enrollment number typed into the ``txtenroll`` box.
        ans: Captcha text typed into the ``CodeNumberTextBox`` box.
        batch_value: ``value`` of the option to pick in the ``ddlbatch``
            drop-down. The default is the value this script was written
            for; pass another one to query a different exam/batch.

    Returns:
        A 2-tuple when the page's ``lblmsg`` label shows a known message:
            ``("err", "ERROR")`` -- the captcha was rejected,
            ``("reqover", "requestOVER")`` -- the request limit was reached,
            ``("nodata", <lblExamName text>)`` -- no data for this number.
        Otherwise a 10-item list:
            ``[enroll, name, exam name, int(lblCUPBack), int(lblTotalBack),
            float(lblSPI), float(lblCPI), float(lblCGPA), int(pt100Curr),
            int(pt100Cuml)]``.

    Raises:
        ValueError: If one of the numeric labels does not hold a number.

    NOTE(review): nothing waits for the page to reload after RETURN is
    sent; confirm the labels are always present by the time they are read.
    """
    batch_select = Select(driver.find_element(By.ID, "ddlbatch"))
    batch_select.select_by_value(batch_value)

    enroll_box = driver.find_element(By.ID, "txtenroll")
    captcha_box = driver.find_element(By.ID, "CodeNumberTextBox")
    enroll_box.send_keys(enroll)
    captcha_box.send_keys(ans)
    captcha_box.send_keys(Keys.RETURN)  # submit the form

    exam_name = driver.find_element(By.ID, "lblExamName").text
    message = driver.find_element(By.ID, "lblmsg").text

    # Known status messages and the tuple reported for each of them.
    known_failures = {
        "ERROR: Incorrect captcha code, try again.": ("err", "ERROR"),
        "Your request count is reached to maximum limit, Please try again later.":
            ("reqover", "requestOVER"),
        "Oppssss! Data not available.": ("nodata", exam_name),
    }
    if message in known_failures:
        return known_failures[message]

    def label_text(element_id):
        """Return the text of the element with the given id."""
        return driver.find_element(By.ID, element_id).text

    name = label_text("lblName")
    current_backlog = label_text("lblCUPBack")
    total_backlog = label_text("lblTotalBack")
    spi = label_text("lblSPI")
    cpi = label_text("lblCPI")
    cgpa = label_text("lblCGPA")
    current_points = label_text("pt100Curr")
    cumulative_points = label_text("pt100Cuml")

    return [
        enroll, name, exam_name,
        int(current_backlog), int(total_backlog),
        float(spi), float(cpi), float(cgpa),
        int(current_points), int(cumulative_points),
    ]
108+
def loop():
    """Scrape every enrollment number in ``mylist`` once.

    For each number a captcha is fetched (``step1``), cleaned (``step2``),
    read (``step3``) and submitted (``step4``). Results are appended as new
    rows to the module-level ``dfout``.

    Returns:
        The enrollment numbers (as strings) whose captcha was rejected, so
        the caller can run them again. The pass stops early, after printing
        a notice, when the site reports that the request limit was reached.
    """
    retry_queue = []
    for number in mylist:
        enroll = str(number)
        step1()
        step2()
        captcha_text = step3("new.png")
        outcome = step4(enroll, captcha_text)
        status = outcome[0]

        if status == "err":
            # Wrong captcha: queue this number for another pass.
            retry_queue.append(enroll)
        elif status == "reqover":
            print("Change the SERVER!")
            break
        elif status == "nodata":
            # Placeholder row; outcome[1] is the exam name from step4.
            dfout.loc[len(dfout)] = [enroll, "nodata", outcome[1]] + ["-"] * 7
        else:
            dfout.loc[len(dfout)] = outcome
        print(enroll)
    return retry_queue
129+
# Configure and start the Chrome webdriver.
co = webdriver.ChromeOptions()
# Run without a visible window. The `headless` attribute on the options
# object is deprecated and was removed in later Selenium 4 releases (where
# assigning it has no effect), so the flag is passed as an argument.
co.add_argument("--headless")
co.add_argument("--incognito")  # incognito mode

# Chromedriver service; the executable must match the installed Chrome version.
# NOTE(review): the spaces after each backslash look like a paste artifact --
# confirm the real location of chromedriver.exe.
ser = Service("G:\\ My Drive\\ projects\\ captcha_solver\\ chromedriver.exe")
driver = webdriver.Chrome(service=ser, options=co)

# Page to scrape and the file the captcha screenshot is saved to.
URL = "https://www.gturesults.in/"
path = "cap.jpg"

import pandas as pd
dfin = pd.read_excel('g.xlsx')
# Enrollment numbers to query. To take them from the input sheet instead,
# use: mylist = dfin['ENRNO'].tolist()
mylist = range(190280111001, 190280111200)
# Empty frame with the input file's column labels. It is copied because
# loop() appends rows to it, and writing into a slice of dfin is unsafe.
dfout = dfin[0:0].copy()

# Main driver: run passes until no enrollment number is left with a rejected
# captcha (loop() also ends a pass early when the request limit is reached).
while True:
    mynewlist = loop()
    if not mynewlist:
        break
    mylist = mynewlist

# Save the collected rows to an Excel file.
dfout.to_excel("out.xlsx")
159+
160+ finally :
161+ driver .close () # close the window
162+ driver .quit () # stop the driver
163+ # remove unnecessary files
164+ os .remove ("cap.jpg" )
165+ os .remove ("old.png" )
166+ os .remove ("new.png" )
0 commit comments