Search
Initial Web Scraping

The initial web scraping was made using the following code. BeautifulSoup is the library that helped us scrape the data we needed. Pymongo is used to connect to our database. You can click the toggle button to view the code.

from bs4 import BeautifulSoup
from requests import get
import csv
import time
import pandas as pd
import pymongo
from pymongo import MongoClient

headers = ({'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit\
/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'})


metrics = ["ID","Make","Model","Body","First Registration","Fuel","Mileage","Power(hp)",
"Gearing Type","Displacement","Warranty","Full Service","Non-smoking Vehicle",
"Model Code","ABS","Driver-side airbag","Passenger-side airbag",
"Sunroof","Radio","4WD","Power windows","Alloy wheels","Central door lock","Alarm system",
"Navigation system","Immobilizer","Side airbag","Seat heating","Disabled accessible","Cruise control",
"Xenon headlights","On-board computer","Electronic stability control","Fog lights","Trailer hitch","Air conditioning",
"Roof rack","Power steering","Automatic climate control","Traction control","Electrically adjustable seats",
"MP3","Panorama roof","Auxiliary heating","Sport package","Start-stop system","Multi-function steering wheel",
"Daytime running lights","Sport suspension","Sport seats",
"Adaptive headlights","Ski bag","Adaptive Cruise Control",
"Armrest","Electrically heated windshield", "Heated steering wheel",
"Hill Holder","Digital radio","LED Headlights","Electric tailgate",
"LED Daytime Running Lights","Leather steering wheel",
"Air suspension","Massage seats","Night view assist","Tire pressure monitoring system",
"Keyless central door lock","Lane departure warning system",
"Blind spot monitor","Touch screen","USB","Traffic sign recognition",
"Electrical side mirrors","Bluetooth","Isofix","Rain sensor",
"Parking assist system sensors front","Parking assist system sensors rear",
"Parking assist system camera","Parking assist system self-steering",
"CD player","Price"]

car_metrics = ["ID","Make","Model","Body","First Registration","Fuel","Mileage","Power(hp)",
"Gearing Type","Displacement","Warranty","Full Service","Non-smoking Vehicle",
"Model Code","Price"]

eq_metrics = ["AdID","ABS","Driver-side airbag","Passenger-side airbag",
"Sunroof","Radio","4WD","Power windows","Alloy wheels","Central door lock","Alarm system",
"Navigation system","Immobilizer","Side airbag","Seat heating","Disabled accessible","Cruise control",
"Xenon headlights","On-board computer","Electronic stability control","Fog lights","Trailer hitch","Air conditioning",
"Roof rack","Power steering","Automatic climate control","Traction control","Electrically adjustable seats",
"MP3","Panorama roof","Auxiliary heating","Sport package","Start-stop system","Multi-function steering wheel",
"Daytime running lights","Sport suspension","Sport seats",
"Adaptive headlights","Ski bag","Adaptive Cruise Control",
"Armrest","Electrically heated windshield", "Heated steering wheel",
"Hill Holder","Digital radio","LED Headlights","Electric tailgate",
"LED Daytime Running Lights","Leather steering wheel",
"Air suspension","Massage seats","Night view assist","Tire pressure monitoring system",
"Keyless central door lock","Lane departure warning system",
"Blind spot monitor","Touch screen","USB","Traffic sign recognition",
"Electrical side mirrors","Bluetooth","Isofix","Rain sensor",
"Parking assist system sensors front","Parking assist system sensors rear",
"Parking assist system camera","Parking assist system self-steering",
"CD player"]

audi_models = ['100','200','50','80','90','A1','A2','A3','A4','A4-allroad','A7','A8','Allroad','Cabriolet','Coupe','e-tron','Q1'
                ,'Q2','Q3','Q5','Q7','Q8','QUATTRO',
                'R8','R5','RS-Q3','RS-Q5','RS-Q8','RS2','RS3','RS4','RS5','RS6','RS7','S1','S2'
                ,'S3','S4','S5','S6','S7','S8','SQ2','SQ5','SQ7','SQ8','TT','TT-RS','TTS','V8']



ford_models = ["aerostar","b-max","bronco","c-max","capri","connect-elekto","consul","cougar","courier","crown","econoline",
               "econovan","ecosport","edge","escape","escort","excursion","expedition","explorer","express","f-1","f-100"
                ,"f-150","f-250","f-350","f-360","f-450","f-550","f-650","f-super-duty","fairlane","falcon","fiesta","flex"
                ,"focus","focus-c-max","focus-cc","freestar","freestyle","fusion","galaxy","gran-torino","granada","grand-c-max"
                ,"gt","ka/ka+","kuga","m","maverick","mercury","mondeo","mustang","orion","probe","puma","ranger","rs-200",
               "s-max","scorpio","sierra","sportka","streetka","taunus","taurus","thunderbird","torino","tourneo-(all)","tourneo"
                ,"tourneo-connect","tourneo-courier","tourneo-custom","transit-(all)","transit","transit-bus","transit-connect"
                ,"transit-courier","transit-custom","windstar"]

seat_models = ["Alhambra","Altea","Altea-XL","Arona","Arosa","Ateca","Cordoba","Exeo","Fura","Ibiza","Inca","Leon","Malaga",
               "Marbella","Mii","Panda","Ronda","Tarraco","Terra","Toledo"]
model_dict={}
makes = ["Audi", "Ford","Seat"]
model_dict["Audi"] = audi_models
model_dict["Ford"] = ford_models
model_dict["Seat"] = seat_models
url = "https://www.autoscout24.com"


with open('cars.csv', 'a') as c, open('equipment.csv','a') as eq,open('cars_simple.csv','a') as cs:
    writer = csv.writer(c, quoting=csv.QUOTE_NONNUMERIC, delimiter=',')
    writer2 = csv.writer(eq, quoting=csv.QUOTE_NONNUMERIC, delimiter=',')
    writer3 = csv.writer(cs, quoting=csv.QUOTE_NONNUMERIC, delimiter=',')
    writer.writerow(metrics)
    writer2.writerow(eq_metrics)
    writer3.writerow(car_metrics)
            
def baraj(make, model, year):
    
    for page in range(1,21):
        base_url =  "https://www.autoscout24.com/lst/{0}/{1}?sort=standard&desc=0&ustate=N%2CU&size=20&page={2}&fregto={3}&fregfrom={4}&atype=C&".format(make,model,page,year,year)
        response = get(base_url, headers=headers)
        html_soup1 = BeautifulSoup(response.text, 'html.parser')
        if(html_soup1.find('div',{'class':'cl-empty-listings-summary'}) is None):
            for a in html_soup1.findAll('a',{"data-item-name": "detail-page-link"}): #sekoj poseben oglas
                r = {}
                url1 = url + a["href"]  #linkot od oglasot
                ID = ''
                split  = url1.split('-')
                print(url1)
                for pole in range (len(split)-5,len(split)):
                        ID+=(split[pole])
                response = get(url1, headers=headers)
                html_soup1 = BeautifulSoup(response.text, 'html.parser')
                w = html_soup1.find('span',{"class":"sc-font-l cldt-stage-primary-keyfact"})
                if(w is not None and w.text.split()[0] is not '-'):
                    r["Mileage"] = (int((w.text).split()[0].replace(",","")))
                w = html_soup1.find('span',{"id":"basicDataFirstRegistrationValue"})
                if (w is not None and w.text is not '-'):
                    r["First Registration"] = w.text
                w = html_soup1.find('span',{"class":"sc-font-m cldt-stage-primary-keyfact"})
                if(w is not None and w.text.split()[0] is not '-'):
                    r["Power(hp)"] = (int((w.text.split()[0])))
                r["Price"] = ((html_soup1.find('div',{'class':'cldt-price'}).find('h2').text).split()[1].split(".")[0].replace(',',''))
                for a in html_soup1.findAll('div',{"class":"cldt-equipment-block sc-grid-col-3 sc-grid-col-m-4 sc-grid-col-s-12 sc-pull-left"}):  #dodatna oprema
                    for b in a.find_all('span'):
                        #print(b.text)
                        r[b.text] = 1
                lst1 = []
                lst2 = []

                for a in html_soup1.findAll('dl'): # paramentri
                    for b in a.find_all('dt'):
                        #print(b.text)
                        lst1.append(b.text)

                for a in html_soup1.findAll('dl'): # vrednosti
                    for b in a.find_all('dd'):
                        #print(b.text.strip())
                        if(b.text.strip() == ""):
                            lst2.append(1)
                        else:
                            lst2.append(b.text.strip())    

                for i in range(0,len(lst1)): #spojuvanje
                    if(lst1[i]=='Displacement'):
                        r[lst1[i]]=int(lst2[i].split()[0].replace(",",""))
                    elif(lst1[i]=='First Registration'):
                        continue
                    else:
                        r[lst1[i]]=lst2[i]

                final = {}
                car_final = {}
                eq_final = {}
                final["ID"]=ID
                car_final["ID"]=ID
                eq_final["AdID"]=ID

                for metric in metrics:
                    if(metric in r):
                        final[metric] = r[metric]
                    elif (metric=="ID"):
                        continue
                    else:
                        final[metric] = None

                for metric in car_metrics:
                    if(metric in r):
                        car_final[metric] = r[metric]
                    elif (metric=="ID"):
                        continue
                    else:
                        car_final[metric] = None

                for metric in eq_metrics:
                    if(metric in r):
                        eq_final[metric] = r[metric]
                    elif (metric=="AdID"):
                        continue
                    else:
                        eq_final[metric] = None

                with open('cars.csv', 'a') as c, open('equipment.csv','a') as eq,open('cars_simple.csv','a') as cs:
                    writer = csv.writer(c, quoting=csv.QUOTE_NONNUMERIC, delimiter=',')
                    writer2 = csv.writer(eq, quoting=csv.QUOTE_NONNUMERIC, delimiter=',')
                    writer3 = csv.writer(cs, quoting=csv.QUOTE_NONNUMERIC, delimiter=',')
                    writer.writerow(list(final.values()))
                    writer2.writerow(list(eq_final.values()))
                    writer3.writerow(list(car_final.values()))
                time.sleep(2)

        else:
            print("Empty query for make {0}, model {1}, year{2}".format(make,model,year))
            break
    
      


for make in makes:
    for model in model_dict[make]:
        for year in range(1990,2021):
            baraj(make,model,year)
            
#Popolni <username> i <pass>
client = MongoClient('mongodb://<Username>:<Password>@dwprojectcluster-shard-00-00.lpqbf.mongodb.net:27017,dwprojectcluster-shard-00-01.lpqbf.mongodb.net:27017,dwprojectcluster-shard-00-02.lpqbf.mongodb.net:27017/<dbname>?ssl=true&replicaSet=atlas-672mwy-shard-0&authSource=admin&retryWrites=true&w=majority')

df = pd.read_csv('cars.csv')

#Izbrisi prazni redovi
df = df[df.ID.notnull()]
df = df.reset_index()
df.drop('index',axis=1, inplace=True)

#Tabela bez prazni redovi
df.to_csv('cars2.csv',index = False)

#Relativno posporo resenie od df.to_dict('records'), megjutoa za razlika od nego gi ignorira NaN vrednostite.
clean_dict_list = df.T.apply(lambda x: x.dropna().to_dict()).tolist()

client.cars_database.cars.insert_many(clean_dict_list)