Scheduled Web Scraping

When it comes to script for scheduled web scraping we have made some improvments in the code. With the current code, data for the ads and sellers collections is also retrieved. This script is put on the DigitalOcean droplet and it is executed every day at 08:05AM using this cron command:

5 8 * * * usr/bin/python script_path/script.py

Click the toggle button to expand the code

from bs4 import BeautifulSoup
from requests import get
from pymongo import MongoClient
import pandas as pd
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

headers = ({'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit\
    /537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'})

metrics = ["ID", "Make", "Model", "Body", "First Registration", "Fuel", "Mileage", "Power(hp)", "Gearing Type",
           "Displacement", "Warranty", "Full Service", "Non-smoking Vehicle", "Model Code", "ABS", "Driver-side airbag",
           "Passenger-side airbag", "Sunroof", "Radio", "4WD", "Power windows", "Alloy wheels", "Central door lock",
           "Alarm system", "Navigation system", "Immobilizer", "Side airbag", "Seat heating", "Disabled accessible",
           "Cruise control", "Xenon headlights", "On-board computer", "Electronic stability control", "Fog lights",
           "Trailer hitch", "Air conditioning", "Roof rack", "Power steering", "Automatic climate control",
           "Traction control", "Electrically adjustable seats", "MP3", "Panorama roof", "Auxiliary heating",
           "Sport package", "Start-stop system", "Multi-function steeripopng wheel", "Daytime running lights",
           "Sport suspension", "Sport seats", "Adaptive headlights", "Ski bag", "Adaptive Cruise Control", "Armrest",
           "Electrically heated windshield", "Heated steering wheel", "Hill Holder", "Digital radio", "LED Headlights",
           "Electric tailgate", "LED Daytime Running Lights", "Leather steering wheel", "Air suspension",
           "Massage seats", "Night view assist", "Tire pressure monitoring system", "Keyless central door lock",
           "Lane departure warning system", "Blind spot monitor", "Touch screen", "USB", "Traffic sign recognition",
           "Electrical side mirrors", "Bluetooth", "Isofix", "Rain sensor", "Parking assist system sensors front",
           "Parking assist system sensors rear", "Parking assist system camera", "Parking assist system self-steering",
           "CD player", "Price"]

car_metrics = ["ID", "Make", "Model", "Body", "First Registration", "Fuel", "Mileage", "Power(hp)", "Gearing Type",
               "Displacement", "Warranty", "Full Service", "Non-smoking Vehicle", "Model Code", "Price"]

eq_metrics = ["AdID", "ABS", "Driver-side airbag", "Passenger-side airbag", "Sunroof", "Radio", "4WD", "Power windows",
              "Alloy wheels", "Central door lock", "Alarm system", "Navigation system", "Immobilizer", "Side airbag",
              "Seat heating", "Disabled accessible", "Cruise control", "Xenon headlights", "On-board computer",
              "Electronic stability control", "Fog lights", "Trailer hitch", "Air conditioning", "Roof rack",
              "Power steering", "Automatic climate control", "Traction control", "Electrically adjustable seats", "MP3",
              "Panorama roof", "Auxiliary heating", "Sport package", "Start-stop system",
              "Multi-function steering wheel", "Daytime running lights", "Sport suspension", "Sport seats",
              "Adaptive headlights", "Ski bag", "Adaptive Cruise Control", "Armrest", "Electrically heated windshield",
              "Heated steering wheel", "Hill Holder", "Digital radio", "LED Headlights", "Electric tailgate",
              "LED Daytime Running Lights", "Leather steering wheel", "Air suspension", "Massage seats",
              "Night view assist", "Tire pressure monitoring system", "Keyless central door lock",
              "Lane departure warning system", "Blind spot monitor", "Touch screen", "USB", "Traffic sign recognition",
              "Electrical side mirrors", "Bluetooth", "Isofix", "Rain sensor", "Parking assist system sensors front",
              "Parking assist system sensors rear", "Parking assist system camera",
              "Parking assist system self-steering", "CD player"]

AD_METRICS = ["CarID", "SellerID", "Title", "Price", "Type", "URL"]
SELLER_METRICS = ["SellerID", "Vendor", "City", "ZipCode", "Country"]

old_metrics = list(map(lambda x: x+'_old', metrics))
old_ads = list(map(lambda x: x+'_old', AD_METRICS))
old_sellers = list(map(lambda x: x+'_old', SELLER_METRICS))



list_of_valid_years = [str(year) for year in range(1990, 2021)]

model_dict = {}
makes = ['Opel', 'Audi', 'Volkswagen', 'BMW', 'Renault', 'Mercedes-Benz', 'Seat', 'Fiat']
url = "https://www.autoscout24.com"

df_metrics = pd.DataFrame(columns=metrics)
df_car_metrics = pd.DataFrame(columns=car_metrics)
df_eq_metrics = pd.DataFrame(columns=eq_metrics)
df_ads_metrics = pd.DataFrame(columns=AD_METRICS)
df_seller_metrics = pd.DataFrame(columns=SELLER_METRICS)

# connecting to mongodb with mongo clinet
client = MongoClient('mongodb://<UserName>:<Password>@'
                    'dwprojectcluster-shard-00-00.lpqbf.mongodb.net:27017,'
                     'dwprojectcluster-shard-00-01.lpqbf.mongodb.net:27017,'
                     'dwprojectcluster-shard-00-02.lpqbf.mongodb.net:27017/'
                     '<dbname>?ssl=true&replicaSet=atlas-672mwy-shard-0'
                     '&authSource=admin&retryWrites=true&w=majority')


def return_ads_sellers(url1):
    TMP_AD_DICT = {}
    TMP_SELLER_DICT = {}

    AD_URL = url1

    try:
        uClient = uReq(AD_URL)
        page_html = uClient.read()
        uClient.close()
    except:
        print("An error occured at {0}".format(AD_URL))

    ad_soup = soup(page_html, "html.parser")

    details = ad_soup.findAll("div", {"class": "cldt-item"})[1]
    tmp = ad_soup.findAll("as24-tracking")
    stage_data = ad_soup.findAll("div", {"class": "cldt-stage-data"})[0]

    PRICE = stage_data.findAll("div", {"class": "cldt-price"})[0].h2.text.strip().split(" ")[1].split(".")[0]

    TMP_AD_DICT["CarID"] = \
    [elem for elem in tmp if "classified_productID" in str(elem)][0]['as24-tracking-value'].split(":")[1][
    :-1].strip().split("\"")[1]
    TMP_AD_DICT["SellerID"] = \
    [elem for elem in tmp if "classified_customerID" in str(elem)][0]['as24-tracking-value'].split(":")[1][
    :-1].strip().split("\"")[1]
    TMP_AD_DICT["Title"] = ad_soup.find("h1", {"class": "cldt-detail-title sc-ellipsis"}).text.strip()

    TMP_AD_DICT["Price"] = int(PRICE.replace(',', ''))

    try:
        TMP_AD_DICT["Type"] = details.find("a", {"class": "cldt-stealth-link"}).text
    except:
        TMP_AD_DICT["Type"] = ad_soup.findAll("dd")[0].text.strip()

    TMP_AD_DICT["URL"] = AD_URL
    if '\r\n' in TMP_AD_DICT["Title"]:
        TMP_AD_DICT["Title"] = TMP_AD_DICT["Title"].replace('\r\n', ' ')
    elif '\n' in TMP_AD_DICT["Title"]:
        TMP_AD_DICT["Title"] = TMP_AD_DICT["Title"].replace('\n', ' ')

    try:
        try:
            vendor = stage_data.find("h3", {"data-item-name": "vendor-company-name"}).text
        except:
            vendor = stage_data.find("h3", {"data-item-name": "vendor-private-seller-title"}).text
    except:
        print("Skipping")

    city = ' '.join(
        map(str, stage_data.findAll("div", {"data-item-name": "vendor-contact-city"})[0].text.split(" ")[1:]))
    zip_code = stage_data.findAll("div", {"data-item-name": "vendor-contact-city"})[0].text.split(" ")[0]
    try:
        country = stage_data.findAll("div", {"data-item-name": "vendor-contact-country"})[0].text
    except:
        country = "NULL"

    TMP_SELLER_DICT["SellerID"] = \
    [elem for elem in tmp if "classified_customerID" in str(elem)][0]['as24-tracking-value'].split(":")[1][
    :-1].strip().split("\"")[1]
    TMP_SELLER_DICT["Vendor"] = vendor
    TMP_SELLER_DICT["City"] = city
    TMP_SELLER_DICT["ZipCode"] = int(zip_code)
    TMP_SELLER_DICT["Country"] = country

    return TMP_AD_DICT, TMP_SELLER_DICT


def baraj(make):
    local_df_metrics = pd.DataFrame(columns=metrics)
    local_df_car_metrics = pd.DataFrame(columns=car_metrics)
    local_df_eq_metrics = pd.DataFrame(columns=eq_metrics)
    local_df_ads_metrics = pd.DataFrame(columns=AD_METRICS)
    local_df_seller_metrics = pd.DataFrame(columns=SELLER_METRICS)

    for page in range(1, 21):

        # srotirano po most recent
        base_url = f"https://www.autoscout24.com/lst/{make}?sort=age&desc=0&ustate=N%2CU&size=20&page={page}&atype=C&"
        response = get(base_url, headers=headers)
        html_soup1 = BeautifulSoup(response.text, 'html.parser')

        if html_soup1.find('div', {'class': 'cl-empty-listings-summary'}) is None:
            # print(base_url)

            for a in html_soup1.findAll('a', {"data-item-name": "detail-page-link"}):  # sekoj poseben oglas

                r = {}
                url1 = url + a["href"]  # linkot od oglasot
                #print(url1)
                response = get(url1, headers=headers)
                html_soup1 = BeautifulSoup(response.text, 'html.parser')
                tmp = html_soup1.findAll("as24-tracking")
                ID = \
                [elem for elem in tmp if "classified_productID" in str(elem)][0]['as24-tracking-value'].split(":")[1][
                :-1].strip().split("\"")[1]

                w = html_soup1.find('span', {"class": "sc-font-l cldt-stage-primary-keyfact"})
                if w is not None and w.text.split()[0] is not '-':
                    r["Mileage"] = (int(w.text.split()[0].replace(",", "").replace('.', '')))

                w = html_soup1.find('span', {"id": "basicDataFirstRegistrationValue"})
                if w is not None and w.text is not '-':
                    r["First Registration"] = w.text
                    if w.text.split('/')[1] not in list_of_valid_years:
                        print(w.text)
                        print("Older Registration; Ignoring ad!")
                        break

                w = html_soup1.find('span', {"class": "sc-font-m cldt-stage-primary-keyfact"})
                if w is not None and w.text.split()[0] is not '-':
                    r["Power(hp)"] = (int((w.text.replace(',', '').replace('.', '').split()[0])))

                r["Price"] = (
                    html_soup1.find('div', {'class': 'cldt-price'}).find('h2').text.split()[1].split(".")[0].replace(
                        ',', '').replace('.', ''))

                # dodatna oprema
                for a in html_soup1.findAll('div', {
                    "class": "cldt-equipment-block sc-grid-col-3 sc-grid-col-m-4 sc-grid-col-s-12 sc-pull-left"}):
                    for b in a.find_all('span'):
                        r[b.text] = 1

                lst1 = []
                lst2 = []

                for a in html_soup1.findAll('dl'):  # paramentri
                    for b in a.find_all('dt'):
                        lst1.append(b.text)

                for a in html_soup1.findAll('dl'):  # vrednosti
                    for b in a.find_all('dd'):
                        if (b.text.strip() == ""):
                            lst2.append(1)
                        else:
                            lst2.append(b.text.strip())

                for i in range(0, len(lst1)):  # spojuvanje
                    if lst1[i] == 'Displacement':
                        r[lst1[i]] = int(lst2[i].split()[0].replace(",", "").replace('.', ''))
                    elif lst1[i] == 'First Registration':
                        continue
                    else:
                        r[lst1[i]] = lst2[i]

                final = {}
                car_final = {}
                eq_final = {}
                final["ID"] = ID
                car_final["ID"] = ID
                eq_final["AdID"] = ID

                for metric in metrics:
                    if metric in r:
                        final[metric] = r[metric]
                    elif metric == "ID":
                        continue
                    else:
                        final[metric] = None

                for metric in car_metrics:
                    if metric in r:
                        car_final[metric] = r[metric]
                    elif metric == "ID":
                        continue
                    else:
                        car_final[metric] = None

                for metric in eq_metrics:
                    if metric in r:
                        eq_final[metric] = r[metric]
                    elif metric == "AdID":
                        continue
                    else:
                        eq_final[metric] = None

                local_df_metrics = local_df_metrics.append(pd.Series(final), ignore_index=True)
                local_df_car_metrics = local_df_car_metrics.append(pd.Series(eq_final), ignore_index=True)
                local_df_eq_metrics = local_df_eq_metrics.append(pd.Series(car_final), ignore_index=True)

                ad_dict, seller_dict = return_ads_sellers(url1)

                if client.cars_database.sellers.count_documents({'SellerID': int(ad_dict['SellerID'])}) == 0:
                    local_df_seller_metrics = local_df_seller_metrics.append(pd.Series(seller_dict), ignore_index=True)
                else:
                    pass
                   # print("Seller with ID: {0} already exists!".format(ad_dict['SellerID']))


                local_df_ads_metrics = local_df_ads_metrics.append(pd.Series(ad_dict), ignore_index=True)

        else:
            print("Empty query for make {0}".format(make))
            break

    return local_df_metrics, local_df_car_metrics, local_df_eq_metrics, local_df_seller_metrics, local_df_ads_metrics


for make in makes:
    iter_df_metrics, iter_df_car_metrics, iter_df_eq_metrics, iter_df_seller_metrics, iter_df_ads_metrics = baraj(make)
    df_metrics = df_metrics.append(iter_df_metrics, ignore_index=True)
    df_car_metrics = df_car_metrics.append(iter_df_car_metrics, ignore_index=True)
    df_eq_metrics = df_eq_metrics.append(iter_df_eq_metrics, ignore_index=True)
    df_seller_metrics = df_seller_metrics.append(iter_df_seller_metrics, ignore_index=True)
    df_ads_metrics = df_ads_metrics.append(iter_df_ads_metrics, ignore_index=True)
    print("FINISHING")
    
which_files_to_load = 'metrics only'

if which_files_to_load == 'all':
    # list_of_dfs = [df_metrics, df_car_metrics, df_eq_metrics]
    list_of_dfs = []  # empty list for now, no protocols specified for other files
    list_of_ads_dfs = []
    list_of_sellers_dfs = []
else:
    list_of_dfs = [df_metrics]         
    list_of_ads_dfs = [df_ads_metrics]
    list_of_sellers_dfs = [df_seller_metrics]

# TODO: if loading all files, specify loading protocol
for df in list_of_dfs:
    df = df.reset_index()
    df.drop('index', axis=1, inplace=True)
    df['Loaded_in_DW'] = False
    
    #Check for present values in the database
    df_old = pd.DataFrame(list(client.cars_database.cars.find({})))
    df_old.drop('_id', axis=1, inplace=True)
    df_old.columns = old_metrics
    left_join = df.merge(df_old, left_on='ID', right_on = 'ID_old', how = 'left')
    left_join = left_join[left_join['ID_old'].isnull()]
    df = left_join.iloc[:,0:83]
    
    # Relativno posporo resenie od df.to_dict('records'), megjutoa za razlika od nego gi ignorira NaN vrednostite.
    clean_dict_list = df.T.apply(lambda x: x.dropna().to_dict()).tolist()

    if len(clean_dict_list) == 0:
        print("No cars to be inserted")
    else:
        client.cars_database.cars.insert_many(clean_dict_list)
        
#insert ads to mongo
for df in list_of_ads_dfs:
    df = df.reset_index()
    df.drop('index', axis=1, inplace=True)
    df['Loaded_in_DW'] = False
    df_old = pd.DataFrame(list(client.cars_database.ads.find({})))
    df_old.drop('_id', axis=1, inplace=True)
    df_old.columns = old_ads
    df['CarID'] = df['CarID'].astype(int)
    df['SellerID'] = df['SellerID'].astype(int)
    left_join = df.merge(df_old, left_on=["CarID", "SellerID"], right_on =["CarID_old", "SellerID_old"] , how = 'left')
    left_join = left_join[left_join['CarID_old'].isnull()]
    df = left_join.iloc[:,0:6]
    #Relativno posporo resenie od df.to_dict('records'), megjutoa za razlika od nego gi ignorira NaN vrednostite.
    clean_dict_list = df.T.apply(lambda x: x.dropna().to_dict()).tolist()

    if len(clean_dict_list) == 0:
        print("No ads to be inserted")
    else:
        client.cars_database.ads.insert_many(clean_dict_list)

for df in list_of_sellers_dfs:
    df = df.reset_index()
    df.drop('index', axis=1, inplace=True)
    df['Loaded_in_DW'] = False
    df_old = pd.DataFrame(list(client.cars_database.sellers.find({})))
    df_old.drop('_id', axis=1, inplace=True)
    df_old.columns = old_sellers
    df['SellerID'] = df['SellerID'].astype(int)
    left_join = df.merge(df_old, left_on= "SellerID", right_on ="SellerID_old" , how = 'left')
    left_join = left_join[left_join['SellerID_old'].isnull()]    
    df = left_join.iloc[:,0:5]
#  Relativno posporo resenie od df.to_dict('records'), megjutoa za razlika od nego gi ignorira NaN vrednostite.
    clean_dict_list = df.T.apply(lambda x: x.dropna().to_dict()).tolist()
    if len(clean_dict_list) == 0:
        print("No sellers to be inserted")
    else:
        client.cars_database.sellers.insert_many(clean_dict_list)
        
        
print("FINISHED")