Taiyo-ai · nileshjson · Aug 23, 2023 · Aug 23, 2023 · Aug 23, 2023 · Aug 23, 2023
diff --git a/data/ScrapedDataCombined.csv b/data/ScrapedDataCombined.csv
@@ -0,0 +1,53 @@
+0,1,2
+"[                                        Tender Title                     Reference No          Closing Date      Bid Opening Date
+0  Renewal of Sophos XGS 31License with enhanced ...               IMMT-P-AMC-02-2020  28-Aug-2023 11:00 AM  29-Aug-2023 11:00 AM
+1                Modular Electrochemical Workstation                  IMMT/PA/10/2023  19-Sep-2023 03:00 PM  20-Sep-2023 03:00 PM
+2     Power Supply Unit Support Part Kit G8400-67028                  IMMT-PZ-16-2023  28-Aug-2023 11:00 AM  29-Aug-2023 11:00 AM
+3  Procurement of Main Turbine Spares for U1 TG R...                        100000861  26-Aug-2023 02:30 PM  28-Aug-2023 03:00 PM
+4  Non-Comprehensive AMC of UV-VIS CARY 4000-5000...               IMMT-P-AMC-04-2023  28-Aug-2023 11:00 AM  29-Aug-2023 11:00 AM
+5  SITC of VRVor VRF type Air-Conditioning system...    HITES/FM/JIPMER/AC/23-24/002   21-Sep-2023 03:00 PM  22-Sep-2023 03:00 PM
+6  Rehabilitation of Rail Track for Stacker Recla...                         CS 5203R  05-Sep-2023 03:30 PM  06-Sep-2023 03:30 PM
+7  RFP Document for Selection of Partner for Impl...       TCIL/IT T III/VSK/01/23 24  31-Aug-2023 01:00 PM  31-Aug-2023 01:30 PM
+8  Improvement/ Development of flooring by replac...    EE(M)-IV/CNZ/TC/2023-24/22-01  28-Aug-2023 03:00 PM  28-Aug-2023 03:30 PM
+9  1Hiring of No. Trailer Mounted Seater Mobile T...  EE(E and M-I)/MCD/2023-24/D-118  30-Aug-2023 03:00 PM  30-Aug-2023 03:05 PM,            Corrigendum Title                                     Reference No          Closing Date      Bid Opening Date
+0              Corrigendum 3     NHAI/Finance/TOT/ 2023-24/Bundle-14/E-206168  21-Sep-2023 11:00 AM  22-Sep-2023 11:30 AM
+1              Corrigendum 3  NHAI/Finance/TOT/2023-24/ Bundle-13-14/E-206168  21-Sep-2023 11:00 AM  22-Sep-2023 11:30 AM
+2              Corrigendum 1                      AAI/INDORE/CIVIL/2023-24/03  09-Sep-2023 06:00 PM  11-Sep-2023 11:00 AM
+3   Extension of tender date                       NHAI/PIU/CHD/11084/VUP/RFP  29-Aug-2023 11:00 AM  30-Aug-2023 11:30 AM
+4  Bid Auto Extn Corrigendum                        AAI/PTK/COMML/CLUB/2023/1  29-Aug-2023 06:00 PM  30-Aug-2023 06:00 PM
+5  Bid Auto Extn Corrigendum                    AAI/TRY/ENGG(E)/CW-02/2023-24  29-Aug-2023 06:00 PM  31-Aug-2023 11:00 AM
+6           Date Corrigendum                     IITM/SPS/Advisor/010/2023-24  11-Sep-2023 02:00 PM  12-Sep-2023 03:00 PM
+7              Corrigendum 1  TCIL/DT/ITTIV/CRIS/SIEM/I/23                     23-Aug-2023 04:00 PM  24-Aug-2023 04:30 PM
+8             AMENDMENT NO.2               HITES/PCD/AIIMS-IV/ 70/Mix/2023-24  11-Sep-2023 01:00 PM  12-Sep-2023 02:30 PM
+9        1Due Date Extension                    NFV/PUR/AMC220090_AMC220102/Y  31-Aug-2023 02:00 PM  01-Sep-2023 02:30 PM]","                                               Title                                        Description
+0  Wuyishan East Station to Wuyishan Scenic Area ...  Wuyishan East Station to Wuyishan Scenic Area ...
+1  The ecological demonstration project of Yiluo ...  The ecological demonstration project of Yiluo ...
+2  The lighting project of the buildings and stre...  The lighting project of the buildings and stre...
+3      The Huhan East Lake Greenway Phase II Project  The Huhan East Lake Greenway Phase II Project ...
+4  The Governance and Restoration Project of Mari...  The Project is a newly-built PPP project, in t...
+5  Watershed Governance for the Botanical Garden ...  The project is a newly-built project, in the s...
+6  The Education and Culture Park in Yongsheng Co...  The project is a newly-built project in the se...","[                                      交易公告        Date
+0   【江苏】HLG2023023-JC013泰州市第二中学附属初中泰州市第...  2023-08-23
+1           【河北】赤城县行政审批局新建市民中心及能力提升改造项目...  2023-08-22
+2           【湖北】通津港及配套基础设施建设项目一期智慧物流产业园...  2023-08-22
+3       【广东】2023年度韶关市武江区第一片区耕地恢复项目（重阳片区...  2023-08-22
+4                  【江苏】2023年沿河镇污水管网“十必接”工程  2023-08-22
+5       【海南】海口市自动体外除颤仪（AED）采购项目（二次招标）—采...  2023-08-22
+6                    【青海】2023年公务用车多元化保障项目.  2023-08-22
+7         【重庆】秀山工业园区中医药产业园中药材精加工定制化标准厂房...  2023-08-22
+8         【内蒙古】翁牛特旗亿合公镇人民政府旱泡子村藜麦晾晒场和烘干...  2023-08-22
+9         【四川】成都纺织高等专科学校数字商贸实训中心——电子商务课...  2023-08-22
+10        【广西】广西众鼎建设工程咨询有限公司覃塘区万亩藕虾综合种养...  2023-08-22
+11      【云南】祥云县中医医院2023年第一批医疗设备采购项目公开招标...  2023-08-22,                                          成交公示        Date
+0               【江苏】云龙区自然资源和规划局成交公示(2023-25号)  2023-08-22
+1              【甘肃】武山县人民医院重症监护能力医疗设备采购公开招标...  2023-08-22
+2                【广东】信宜市大成镇卫生院16排螺旋CT采购项目结果公告  2023-08-22
+3   【河北】Z1300002321291001河北省企业信用研究中心云网融合技术...  2023-08-22
+4          【四川】中国共产党德阳市委员会宣传部2023年度文明交通引导中...  2023-08-22
+5            【青海】祁连县黑河治理工程（鹿场及野牛沟乡政府段）监理服务...  2023-08-22
+6          【广西】广西俊航项目管理有限公司2024年桂平市高标准农田新建...  2023-08-22
+7              【黑龙江】黑龙江省林业卫生学校2023年水暖维修工程结果公告  2023-08-22
+8          【辽宁】辽宁省老干部教育活动中心服务站设备采购中标公告(LNZ...  2023-08-22
+9            【内蒙古】锡林浩特市巴彦宝拉格苏木人民政府锡林浩特市巴彦宝...  2023-08-22
+10           【安徽】宜秀区基础设施补短板（一期）建设项目、宜秀区基础设...  2023-08-22
+11          【湖南】新化县文田镇中心幼儿园设施设备采购项目(第二次）暂停...  2023-08-22]"
diff --git a/dummy-data-product/src/main.py b/dummy-data-product/src/main.py
@@ -0,0 +1,20 @@
+from scrapper import Scrapper
+import pandas as pd
+from result import results
+
+# Landing pages to scrape; results() picks the Scrapper method that fits each.
+urls = ['https://etenders.gov.in/eprocure/app',
+        'https://www.cpppc.org/en/PPPyd.jhtml',
+        'http://www.ggzy.gov.cn/']
+
+
+def _frames_from(scraped):
+    """Return the DataFrames contained in one results() return value.
+
+    results() yields a single DataFrame, a list of DataFrames, or a plain
+    string marker when nothing could be scraped; markers produce an empty
+    list so they never end up as rows in the output file.
+    """
+    if isinstance(scraped, pd.DataFrame):
+        return [scraped]
+    if isinstance(scraped, (list, tuple)):
+        return [item for item in scraped if isinstance(item, pd.DataFrame)]
+    return []
+
+
+if __name__ == '__main__':
+
+    frames = []
+    for url in urls:
+        for frame in _frames_from(results(url)):
+            # Tag each row with its origin so the combined file stays traceable.
+            frames.append(frame.assign(source=url))
+
+    # Concatenate the actual rows. Wrapping the result objects in another
+    # DataFrame would store only their truncated text repr in each cell.
+    if frames:
+        combined = pd.concat(frames, ignore_index=True, sort=False)
+    else:
+        combined = pd.DataFrame()
+
+    #to save a CSV file
+    combined.to_csv('ScrapedDataCombined.csv', index=False)
+
+    print(combined)
diff --git a/dummy-data-product/src/requirements.txt b/dummy-data-product/src/requirements.txt
@@ -0,0 +1,3 @@
+pandas==2.0.3
+beautifulsoup4==4.12.2
+requests==2.31.0
diff --git a/dummy-data-product/src/result.py b/dummy-data-product/src/result.py
@@ -0,0 +1,29 @@
+import requests
+from scrapper import Scrapper
+from bs4 import BeautifulSoup
+def results(url):
+    """Scrape ``url`` with the Scrapper method that matches its page layout.
+
+    The page is fetched once and probed for the marker element of each
+    supported layout; the matching Scrapper method then does the scraping.
+
+    Args:
+        url: Address of the page to scrape.
+
+    Returns:
+        Whatever the selected Scrapper method returns; the string
+        'Not Allowed' when the page cannot be fetched with HTTP status 200;
+        or the string 'None' when the page matches none, or more than one,
+        of the supported layouts.
+    """
+    try:
+        # Without a timeout a stalled server would block the run forever.
+        res = requests.get(url, timeout=30)
+    except requests.RequestException:
+        # An unreachable site is reported like a refused one so that the
+        # remaining URLs can still be processed by the caller.
+        return 'Not Allowed'
+
+    # Compare the numeric status instead of the repr text of the response.
+    if res.status_code != 200:
+        return 'Not Allowed'
+
+    soup = BeautifulSoup(res.text, 'html.parser')
+    #checking all the layouts through their tags and class name or id
+    layouts = [
+        (soup.find('table', id='activeTenders'), Scrapper.etender),
+        (soup.find('ul', 'new-content ppp-list'), Scrapper.cppc),
+        (soup.find('div', class_='main_list_on'), Scrapper.ctender),
+    ]
+    matched = [scrape for marker, scrape in layouts if marker is not None]
+
+    # Exactly one layout must be present, otherwise the page is ambiguous
+    # or unsupported.
+    if len(matched) != 1:
+        return 'None'
+
+    return matched[0](Scrapper(url))
diff --git a/dummy-data-product/src/scrapper.py b/dummy-data-product/src/scrapper.py
@@ -0,0 +1,123 @@
+import pandas as pd
+from bs4 import BeautifulSoup
+import re
+import requests
+
+class Scrapper:
+    """Scrape tender listings from the page at ``url``.
+
+    Each public method handles one page layout and downloads ``self.url``
+    itself, so a method must only be called for a page of its layout.
+    """
+
+    # Seconds to wait for a site before giving up instead of hanging forever.
+    REQUEST_TIMEOUT = 30
+
+    # Serial number at the very start of a title, e.g. "1. " or "10. ".
+    # Anchored so that digits inside the title itself are left alone.
+    _SERIAL_PREFIX = re.compile(r'^\s*\d+\.\s*')
+
+    def __init__(self, url):
+        self.url = url
+
+    def _soup(self):
+        """Download ``self.url`` and return it parsed with html.parser."""
+        res = requests.get(self.url, timeout=self.REQUEST_TIMEOUT)
+        return BeautifulSoup(res.text, 'html.parser')
+
+    @staticmethod
+    def _table_rows(table):
+        """Return the text of every <td> of ``table``, one list per <tr>."""
+        return [[td.text for td in tr.find_all('td')]
+                for tr in table.find_all('tr')]
+
+    @classmethod
+    def _strip_serial(cls, title):
+        """Remove a leading serial number from ``title``; pass non-strings through."""
+        if not isinstance(title, str):
+            return title
+        return cls._SERIAL_PREFIX.sub('', title)
+
+    @staticmethod
+    def _panel_frame(panel):
+        """Build a two-column DataFrame (heading, 'Date') from one list panel."""
+        # The <h4> text also includes the text of a nested span, so only the
+        # first four characters are used as the column name.
+        heading = panel.find('h4').text[:4]
+
+        rows = []
+        for ul in panel.find_all('ul'):
+            for li in ul.find_all('li'):
+                link = li.find('a')      # link extraction
+                date = li.find('span')   # date extraction
+                if link is None or date is None:
+                    # Skip list items that do not carry both parts.
+                    continue
+                rows.append([link.text, date.text])
+
+        # Passing the columns up front also works when no row was found.
+        return pd.DataFrame(rows, columns=[heading, 'Date'])
+
+    #Government of India tenders
+    def etender(self):
+        """Scrape the active tenders and active corrigendums tables.
+
+        Returns:
+            ``[tenders, corrigendums]``, two DataFrames whose column names
+            are read from the page's ``list_header`` rows.
+        """
+        soup = self._soup()
+
+        #scraping activeTenders table
+        tender_rows = self._table_rows(soup.find('table', id='activeTenders'))
+
+        #scraping corrigendums table
+        corrigendum_rows = self._table_rows(
+            soup.find('table', id='activeCorrigendums'))
+
+        headers = [[td.text for td in row.find_all('td')]
+                   for row in soup.find_all('tr', class_='list_header')]
+
+        # Header rows 0 and 2 are the ones paired with the two tables above;
+        # NOTE(review): row 1 presumably belongs to another table on the
+        # page - confirm against the live markup.
+        df1 = pd.DataFrame(tender_rows, columns=headers[0])
+        df2 = pd.DataFrame(corrigendum_rows, columns=headers[2])
+
+        #separating the serial number that comes along with the title
+        df1['Tender Title'] = df1['Tender Title'].apply(self._strip_serial)
+        df2['Corrigendum Title'] = df2['Corrigendum Title'].apply(
+            self._strip_serial)
+
+        #returning dataframes
+        return [df1, df2]
+
+    #Chinese partnership centres
+
+    def cppc(self):
+        """Scrape the ``new-content ppp-list`` list into Title/Description rows.
+
+        Returns:
+            A DataFrame with the columns 'Title' (anchor text) and
+            'Description' (text of the first <div>) per list item.
+        """
+        soup = self._soup()
+        listing = soup.find('ul', 'new-content ppp-list')
+
+        rows = []
+        #scraping list items
+        for item in listing.find_all('li'):
+            anchor = item.find('a')   # heading of the entry
+            meta = item.find('div')   # metadata of the entry
+            if anchor is None or meta is None:
+                # Skip list items that do not carry both parts.
+                continue
+            rows.append([anchor.text, meta.text])
+
+        return pd.DataFrame(rows, columns=['Title', 'Description'])
+
+    #Chinese tenders
+
+    def ctender(self):
+        """Scrape the two ``main_list_on`` list panels of the page.
+
+        Returns:
+            ``[first, second]``, one DataFrame per panel, each with the
+            panel heading and 'Date' as columns.
+        """
+        soup = self._soup()
+
+        first_panel = soup.find('div', class_='main_list_on')
+
+        #scraping the right side table
+        second_panel = soup.find('div', class_='main_list_on main_list_tw')
+
+        return [self._panel_frame(first_panel),
+                self._panel_frame(second_panel)]
+
+
+
+
+