Skip to content

Commit 3340658

Browse files
committed
re1stcommit
0 parents  commit 3340658

10 files changed

Lines changed: 376 additions & 0 deletions

File tree

README.md

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# Result-Data-Analyzer
2+
A result-data scraper and analyzer with a CAPTCHA solver, built with Tesseract OCR, Selenium automation, and pandas DataFrames.
3+
4+
A scraper that scrapes results from various GTU result sites.
5+
6+
Implemented using Python.
7+
8+
The result sites use a text-CAPTCHA human-verification step before showing a result.
9+
10+
Here I cracked the verification process with an OCR (Optical Character Recognition) module.
11+
12+
By using Selenium and the pytesseract OCR module, we can scrape pages protected by a text-CAPTCHA human verification.
13+
14+
Used Pandas for managing data in tabular format.
15+
16+
Here I provide the source code for scraping data for analytics.
17+
18+
### YouTube:
19+
20+
https://youtu.be/2nPUuaq4RRI (gturesults.in)
21+
22+
https://youtu.be/jXfwSVq0uA8 (students.gtu.ac.in)
23+
24+
#### Enjoy!
25+
26+
#### Thank You!

gturesults.in/README.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Note :
2+
3+
To run this in your system
4+
5+
1> you have to install tesseract-ocr in your system
6+
7+
2> you have to download your browser's driver file
8+
9+
Here I use the Chrome browser in incognito mode, so I downloaded chromedriver.exe.
10+
11+
The driver is version-specific: choose the chromedriver that matches your Chrome version.
12+
13+
You have to set the path to this driver file in the program.

gturesults.in/g.xlsx

9.3 KB
Binary file not shown.

gturesults.in/out.xlsx

20.4 KB
Binary file not shown.

gturesults.in/scrapper V1.py

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
try:
2+
# import dependencies
3+
from selenium import webdriver
4+
from selenium.webdriver.common.by import By
5+
from selenium.webdriver.common.keys import Keys
6+
from selenium.webdriver.support.select import Select
7+
from selenium.webdriver.chrome.service import Service
8+
from PIL import Image, ImageCms, ImageFilter
9+
import numpy as np
10+
import pytesseract
11+
import cv2
12+
import time
13+
import os
14+
15+
## Helper Functions
16+
17+
def step1():
    """Load the result page and store a screenshot of its captcha element.

    Uses the module-level ``driver``, ``URL`` and ``path``: the page at
    ``URL`` is opened and the PNG screenshot of the element whose id is
    ``imgCaptcha`` is written to ``path``.
    """
    driver.get(URL)

    captcha_element = driver.find_element(By.ID, "imgCaptcha")
    with open(path, "wb") as captcha_file:
        captcha_file.write(captcha_element.screenshot_as_png)
25+
26+
def step2():
    """Overlay a horizontal-stroke mask on the captcha and save ``new.png``.

    Reads the captcha from the module-level ``path``, derives a mask with an
    inverted Otsu threshold followed by a morphological opening (written to
    ``old.png``), then pastes every non-black mask pixel onto the original
    captcha and writes the result to ``new.png``.
    """
    # Inverted Otsu threshold, then an opening with a 25x1 rectangular
    # kernel so that only wide horizontal runs stay in the mask.
    captcha_bgr = cv2.imread(path)
    captcha_gray = cv2.cvtColor(captcha_bgr, cv2.COLOR_BGR2GRAY)
    _, binary_inv = cv2.threshold(
        captcha_gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )
    wide_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 1))
    line_mask = cv2.morphologyEx(
        binary_inv, cv2.MORPH_OPEN, wide_kernel, iterations=2
    )
    cv2.imwrite("old.png", line_mask)

    # Reload the mask as RGBA; pure-black pixels become fully transparent
    # so that only the remaining mask pixels are pasted below.
    overlay = Image.open("old.png").convert("RGBA")
    transparent = (255, 255, 255, 0)
    overlay.putdata(
        [transparent if px[:3] == (0, 0, 0) else px for px in overlay.getdata()]
    )

    # Paste the overlay on the original captcha, using its own alpha as mask.
    canvas = Image.open(path).convert("RGBA")
    canvas.paste(overlay, mask=overlay)
    canvas.save("new.png", "PNG")
53+
54+
def step3(im):
    """OCR the processed captcha image and return the recognised text.

    Args:
        im: Path of the image to read first (callers pass ``"new.png"``).

    Returns:
        The stripped, non-empty text reported by Tesseract. The text is not
        validated, so it may still be a wrong reading of the captcha.

    Whenever OCR yields nothing, a new captcha is fetched with ``step1()`` and
    ``step2()`` and ``"new.png"`` is read again. The retry is a loop rather
    than recursion, so a long run of unreadable captchas cannot exhaust the
    interpreter's recursion limit.
    """
    # The colour profiles and the transform do not depend on the image, so
    # they are built once instead of on every retry.
    srgb_profile = ImageCms.createProfile(colorSpace='sRGB')
    lab_profile = ImageCms.createProfile(colorSpace='LAB')
    to_lab = ImageCms.buildTransform(
        inputProfile=srgb_profile,
        outputProfile=lab_profile,
        inMode='RGB',
        outMode='LAB',
    )

    image_path = im
    while True:
        # crop() returns an independent image, so the source file can be
        # closed straight away.
        with Image.open(image_path) as source:
            captcha = source.crop((5, 5, 115, 35))

        # Only the L channel of the LAB image is passed to OCR.
        lightness, _a, _b = ImageCms.applyTransform(
            im=captcha, transform=to_lab
        ).split()
        lightness = lightness.filter(ImageFilter.MinFilter(3))

        text = pytesseract.image_to_string(lightness).strip()
        if text:
            return text

        # Nothing recognised: fetch and pre-process a fresh captcha.
        step1()
        step2()
        image_path = "new.png"
73+
74+
def step4(enroll, ans):
    """Submit one enrollment number with the captcha answer and read the page.

    Args:
        enroll: Enrollment number typed into the ``txtenroll`` box.
        ans: Captcha text typed into the ``CodeNumberTextBox`` box.

    Returns:
        A 2-tuple starting with ``"err"``, ``"reqover"`` or ``"nodata"`` when
        the ``lblmsg`` label shows the matching message; otherwise a 10-item
        list ``[enroll, name, exam name, ...]`` with the numeric labels
        converted to ``int``/``float``.
    """
    def text_of(element_id):
        # Text of the element with the given id on the current page.
        return driver.find_element(By.ID, element_id).text

    # The option value below is specific to one exam batch.
    batch_select = Select(driver.find_element(By.ID, "ddlbatch"))
    batch_select.select_by_value("3151$W2021$2022-03-16$current$0")

    enroll_box = driver.find_element(By.ID, "txtenroll")
    captcha_box = driver.find_element(By.ID, "CodeNumberTextBox")
    enroll_box.send_keys(enroll)
    captcha_box.send_keys(ans)
    captcha_box.send_keys(Keys.RETURN)  # submit the form

    exam_name = text_of("lblExamName")
    message = text_of("lblmsg")

    failures = {
        "ERROR: Incorrect captcha code, try again.": ("err", "ERROR"),
        "Your request count is reached to maximum limit, Please try again later.": (
            "reqover",
            "requestOVER",
        ),
        "Oppssss! Data not available.": ("nodata", exam_name),
    }
    if message in failures:
        return failures[message]

    # Read every label first, then convert, so lookups happen before parsing.
    name, cup_back, total_back, spi, cpi, cgpa, pt100_curr, pt100_cuml = [
        text_of(element_id)
        for element_id in (
            "lblName",
            "lblCUPBack",
            "lblTotalBack",
            "lblSPI",
            "lblCPI",
            "lblCGPA",
            "pt100Curr",
            "pt100Cuml",
        )
    ]
    return [
        enroll, name, exam_name,
        int(cup_back), int(total_back),
        float(spi), float(cpi), float(cgpa),
        int(pt100_curr), int(pt100_cuml),
    ]
108+
109+
def loop():
    """Process every entry of the module-level ``mylist`` once.

    Each entry goes through ``step1``-``step4``. Rows are appended to the
    module-level ``dfout``; the pass stops early when the site reports that
    the request limit is reached.

    Returns:
        The enrollment numbers (as strings) whose captcha answer was rejected.
    """
    # NOTE(review): indentation was lost in the reviewed copy; the progress
    # print is assumed to run once per processed entry - confirm.
    retry_queue = []
    for number in mylist:
        enroll = f"{number}"
        step1()
        step2()
        outcome = step4(enroll, step3("new.png"))
        status = outcome[0]

        if status == "reqover":
            print("Change the SERVER!")
            break

        if status == "err":
            retry_queue.append(enroll)
        elif status == "nodata":
            dfout.loc[len(dfout)] = [enroll, "nodata", outcome[1]] + ["-"] * 7
        else:
            dfout.loc[len(dfout)] = outcome
        print(enroll)
    return retry_queue
129+
130+
# Configure Chrome: headless (no visible window) and incognito.
co = webdriver.ChromeOptions()
# The `headless` attribute of the options object was deprecated in
# Selenium 4.8 and removed in later releases, where assigning it has no
# effect; the command-line switch works across Selenium versions.
co.add_argument("--headless")
co.add_argument("--incognito")

# Start chromedriver from an explicit path (pick the chromedriver build that
# matches the installed Chrome version).
ser = Service("G:\\My Drive\\projects\\captcha_solver\\chromedriver.exe")
driver = webdriver.Chrome(service=ser, options=co)

# Page to scrape and the temporary file the captcha screenshot is saved to.
URL = "https://www.gturesults.in/"
path = "cap.jpg"

import pandas as pd
dfin = pd.read_excel('g.xlsx')
# Enrollment numbers to query. Alternative: read them from the input sheet
# with dfin['ENRNO'].tolist(). Note that range() excludes its end value.
mylist = range(190280111001, 190280111200)
# Empty frame with the input file's column labels; copied so that appending
# rows never writes through a slice of dfin.
dfout = dfin.iloc[0:0].copy()

# Main driver: repeat until a pass leaves no rejected captchas to retry
# (loop() also returns early when the site reports its request limit).
while True:
    mynewlist = loop()
    if not mynewlist:
        break
    mylist = mynewlist

# Save the collected rows to an Excel file.
dfout.to_excel("out.xlsx")
159+
160+
finally:
161+
driver.close() # close the window
162+
driver.quit() # stop the driver
163+
# remove unnecessary files
164+
os.remove("cap.jpg")
165+
os.remove("old.png")
166+
os.remove("new.png")

students.gtu.ac.in/Picture1.png

103 KB
Loading

students.gtu.ac.in/README.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Note :
2+
3+
To run this in your system
4+
5+
1> you have to install tesseract-ocr in your system
6+
7+
2> you have to download your browser's driver file
8+
9+
Here I use the Chrome browser in incognito mode, so I downloaded chromedriver.exe.
10+
11+
The driver is version-specific: choose the chromedriver that matches your Chrome version.
12+
13+
You have to set the path to this driver file in the program.
14+
15+
16+
# Analyzed Data :
17+
### CPI-Distribution-Histogram :
18+
![Data](https://github.com/alloc7260/Result-Data-Analyzer/blob/main/students.gtu.ac.in/Picture1.png?raw=true "CPI-Distribution-Histogram")

students.gtu.ac.in/g.xlsx

9.34 KB
Binary file not shown.

students.gtu.ac.in/out.xlsx

25.2 KB
Binary file not shown.

students.gtu.ac.in/scrapper V0.py

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
try:
2+
# import dependencies
3+
from selenium import webdriver
4+
from selenium.webdriver.common.by import By
5+
from selenium.webdriver.common.keys import Keys
6+
from selenium.webdriver.chrome.service import Service
7+
from PIL import Image, ImageCms, ImageFilter
8+
import numpy as np
9+
import pytesseract
10+
import cv2
11+
import time
12+
import os
13+
14+
## Helper Functions
15+
16+
def step1():
    """Load the result page and store a screenshot of its captcha element.

    Uses the module-level ``driver``, ``URL`` and ``path``: the page at
    ``URL`` is opened and the PNG screenshot of the element whose id is
    ``imgCaptcha`` is written to ``path``.
    """
    driver.get(URL)

    captcha_element = driver.find_element(By.ID, "imgCaptcha")
    with open(path, "wb") as captcha_file:
        captcha_file.write(captcha_element.screenshot_as_png)
24+
25+
def step2():
    """Overlay a horizontal-stroke mask on the captcha and save ``new.png``.

    Reads the captcha from the module-level ``path``, derives a mask with an
    inverted Otsu threshold followed by a morphological opening (written to
    ``old.png``), then pastes every non-black mask pixel onto the original
    captcha and writes the result to ``new.png``.
    """
    # Inverted Otsu threshold, then an opening with a 25x1 rectangular
    # kernel so that only wide horizontal runs stay in the mask.
    captcha_bgr = cv2.imread(path)
    captcha_gray = cv2.cvtColor(captcha_bgr, cv2.COLOR_BGR2GRAY)
    _, binary_inv = cv2.threshold(
        captcha_gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )
    wide_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 1))
    line_mask = cv2.morphologyEx(
        binary_inv, cv2.MORPH_OPEN, wide_kernel, iterations=2
    )
    cv2.imwrite("old.png", line_mask)

    # Reload the mask as RGBA; pure-black pixels become fully transparent
    # so that only the remaining mask pixels are pasted below.
    overlay = Image.open("old.png").convert("RGBA")
    transparent = (255, 255, 255, 0)
    overlay.putdata(
        [transparent if px[:3] == (0, 0, 0) else px for px in overlay.getdata()]
    )

    # Paste the overlay on the original captcha, using its own alpha as mask.
    canvas = Image.open(path).convert("RGBA")
    canvas.paste(overlay, mask=overlay)
    canvas.save("new.png", "PNG")
52+
53+
def step3(im):
    """OCR the processed captcha image and return the recognised text.

    Args:
        im: Path of the image to read first (callers pass ``"new.png"``).

    Returns:
        The stripped, non-empty text reported by Tesseract. The text is not
        validated, so it may still be a wrong reading of the captcha.

    Whenever OCR yields nothing, a new captcha is fetched with ``step1()`` and
    ``step2()`` and ``"new.png"`` is read again. The retry is a loop rather
    than recursion, so a long run of unreadable captchas cannot exhaust the
    interpreter's recursion limit.
    """
    # The colour profiles and the transform do not depend on the image, so
    # they are built once instead of on every retry.
    srgb_profile = ImageCms.createProfile(colorSpace='sRGB')
    lab_profile = ImageCms.createProfile(colorSpace='LAB')
    to_lab = ImageCms.buildTransform(
        inputProfile=srgb_profile,
        outputProfile=lab_profile,
        inMode='RGB',
        outMode='LAB',
    )

    image_path = im
    while True:
        # crop() returns an independent image, so the source file can be
        # closed straight away.
        with Image.open(image_path) as source:
            captcha = source.crop((5, 5, 115, 35))

        # Only the L channel of the LAB image is passed to OCR.
        lightness, _a, _b = ImageCms.applyTransform(
            im=captcha, transform=to_lab
        ).split()
        lightness = lightness.filter(ImageFilter.MinFilter(3))

        text = pytesseract.image_to_string(lightness).strip()
        if text:
            return text

        # Nothing recognised: fetch and pre-process a fresh captcha.
        step1()
        step2()
        image_path = "new.png"
72+
73+
def step4(enroll, ans):
    """Submit one enrollment number with the captcha answer and read the page.

    Args:
        enroll: Enrollment number typed into the ``txtEnrollNo`` box.
        ans: Captcha text typed into the ``CodeNumberTextBox`` box.

    Returns:
        ``"err"`` or ``"nodata"`` when the ``lblMSG`` label shows the matching
        message; otherwise an 11-item list built from the page labels and the
        second rows of the ``grdv2`` and ``grdvLastExm`` tables.
    """
    def text_of(element_id):
        # Text of the element with the given id on the current page.
        return driver.find_element(By.ID, element_id).text

    def first_match_cells(xpath):
        # Text of the first element matching the XPath, split on single spaces.
        matches = driver.find_elements(By.XPATH, xpath)
        return matches[0].text.split(" ")

    enroll_box = driver.find_element(By.ID, "txtEnrollNo")
    captcha_box = driver.find_element(By.ID, "CodeNumberTextBox")
    enroll_box.send_keys(enroll)
    captcha_box.send_keys(ans)
    captcha_box.send_keys(Keys.RETURN)  # submit the form

    message = text_of("lblMSG")
    if message == "ERROR: Incorrect captcha code, try again.":
        return "err"
    if message == "No Data Found! Check Enrollment Number.":
        return "nodata"

    name = text_of("lblName")
    ins = text_of("lblInstName")
    bra = text_of("lblBranch")

    summary = first_match_cells("//table[@id='grdv2']/tbody/tr[2]")
    cpi, cgpa, tb = summary[1], summary[2], summary[3]

    last_exam = first_match_cells("//table[@id='grdvLastExm']/tbody/tr[2]")
    # The first seven space-separated tokens are kept together as one field.
    le = " ".join(last_exam[:7])
    dd, cb, spi = last_exam[8], last_exam[9], last_exam[10]

    return [
        enroll, name, ins, bra,
        float(cpi), float(cgpa), int(tb),
        le, dd, int(cb), float(spi),
    ]
98+
99+
def loop():
    """Process every entry of the module-level ``mylist`` once.

    Each entry goes through ``step1``-``step4`` and a row is appended to the
    module-level ``dfout`` unless the captcha answer was rejected.

    Returns:
        The enrollment numbers (as strings) whose captcha answer was rejected.
    """
    # NOTE(review): indentation was lost in the reviewed copy; the progress
    # print is assumed to run once per processed entry - confirm.
    retry_queue = []
    for number in mylist:
        enroll = f"{number}"
        step1()
        step2()
        outcome = step4(enroll, step3("new.png"))

        if outcome == "err":
            retry_queue.append(enroll)
        elif outcome == "nodata":
            dfout.loc[len(dfout)] = [enroll, "nodata"] + ["-"] * 9
        else:
            dfout.loc[len(dfout)] = outcome
        print(enroll)
    return retry_queue
116+
117+
# Configure Chrome: headless (no visible window) and incognito.
co = webdriver.ChromeOptions()
# The `headless` attribute of the options object was deprecated in
# Selenium 4.8 and removed in later releases, where assigning it has no
# effect; the command-line switch works across Selenium versions.
co.add_argument("--headless")
co.add_argument("--incognito")

# Start chromedriver from an explicit path (pick the chromedriver build that
# matches the installed Chrome version).
ser = Service("G:\\My Drive\\projects\\captcha_solver\\chromedriver.exe")
driver = webdriver.Chrome(service=ser, options=co)

# Page to scrape and the temporary file the captcha screenshot is saved to.
URL = "https://www.students.gtu.ac.in/"
path = "cap.jpg"

import pandas as pd
dfin = pd.read_excel('g.xlsx')
# Enrollment numbers to query. Alternative: read them from the input sheet
# with dfin['ENRNO'].tolist(). Note that range() excludes its end value.
mylist = range(190280111001, 190280111200)
# Empty frame with the input file's column labels; copied so that appending
# rows never writes through a slice of dfin.
dfout = dfin.iloc[0:0].copy()

# Main driver: repeat until a pass leaves no rejected captchas to retry.
while True:
    mynewlist = loop()
    if not mynewlist:
        break
    mylist = mynewlist

# Save the collected rows to an Excel file.
dfout.to_excel("out.xlsx")
146+
147+
finally:
148+
driver.close() # close the window
149+
driver.quit() # stop the driver
150+
# remove unnecessary files
151+
os.remove("cap.jpg")
152+
os.remove("old.png")
153+
os.remove("new.png")

0 commit comments

Comments
 (0)