1 Introduction

From Description Page

In their fourth Kaggle competition, Avito is challenging you to predict demand for an online advertisement based on its full description (title, description, images, etc.), its context (geographically where it was posted, similar ads already posted) and historical demand for similar ads in similar contexts. With this information, Avito can inform sellers on how to best optimize their listing and provide some indication of how much interest they should realistically expect to receive.


18 Variables

  • item_id - Ad id.
  • user_id - User id.
  • region - Ad region.
  • city - Ad city.
  • parent_category_name - Top level ad category as classified by Avito’s ad model.
  • category_name - Fine grain ad category as classified by Avito’s ad model.
  • param_1 - Optional parameter from Avito’s ad model.
  • param_2 - Optional parameter from Avito’s ad model.
  • param_3 - Optional parameter from Avito’s ad model.
  • title - Ad title.
  • description - Ad description.
  • price - Ad price.
  • item_seq_number - Ad sequential number for user.
  • activation_date - Date ad was placed.
  • user_type - User type.
  • image - Id code of image. Ties to a jpg file in train_jpg. Not every ad has an image.
  • image_top_1 - Avito’s classification code for the image.
  • deal_probability - The target variable. This is the likelihood that an ad actually sold something. It’s not possible to verify every transaction with certainty, so this column’s value can be any float from zero to one.

2 Library & Importing

2.1 Loading Library

library(tidyverse)
library(knitr)
library(skimr)
library(DT)
library(ggthemes)
library(lubridate)

2.2 Importing Data

# Read the train and test CSV files into `avi` and `avite`.
avi <- read_csv(file = "../input/train.csv")
avite <- read_csv(file = "../input/test.csv")

3 Glimpse of Data

3.1 Glimpse of train

# One line per column of the training data: type plus the first few values.
avi %>% glimpse()
## Observations: 1,503,424
## Variables: 18
## $ item_id              <chr> "b912c3c6a6ad", "2dac0150717d", "ba83aefa...
## $ user_id              <chr> "e00f8ff2eaf9", "39aeb48f0017", "91e2f88d...
## $ region               <chr> "Свердловская область", "Самарская област...
## $ city                 <chr> "Екатеринбург", "Самара", "Ростов-на-Дону...
## $ parent_category_name <chr> "Личные вещи", "Для дома и дачи", "Бытова...
## $ category_name        <chr> "Товары для детей и игрушки", "Мебель и и...
## $ param_1              <chr> "Постельные принадлежности", "Другое", "В...
## $ param_2              <chr> NA, NA, NA, NA, "ВАЗ (LADA)", NA, NA, "Дж...
## $ param_3              <chr> NA, NA, NA, NA, "2110", NA, NA, "26", "> ...
## $ title                <chr> "Кокоби(кокон для сна)", "Стойка для Одеж...
## $ description          <chr> "Кокон для сна малыша,пользовались меньше...
## $ price                <dbl> 400, 3000, 4000, 2200, 40000, 1300, 11000...
## $ item_seq_number      <int> 2, 19, 9, 286, 3, 9, 125, 61, 85, 136, 6,...
## $ activation_date      <date> 2017-03-28, 2017-03-26, 2017-03-20, 2017...
## $ user_type            <chr> "Private", "Private", "Private", "Company...
## $ image                <chr> "d10c7e016e03247a3bf2d13348fe959fe6f436c1...
## $ image_top_1          <dbl> 1008, 692, 3032, 796, 2264, 796, 2823, 56...
## $ deal_probability     <dbl> 0.12789, 0.00000, 0.43177, 0.80323, 0.207...

3.2 Summary of train

# Per-column summary statistics of the training data.
avi %>% summary()
##    item_id            user_id             region         
##  Length:1503424     Length:1503424     Length:1503424    
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##      city           parent_category_name category_name     
##  Length:1503424     Length:1503424       Length:1503424    
##  Class :character   Class :character     Class :character  
##  Mode  :character   Mode  :character     Mode  :character  
##                                                            
##                                                            
##                                                            
##                                                            
##    param_1            param_2            param_3         
##  Length:1503424     Length:1503424     Length:1503424    
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##     title           description            price          
##  Length:1503424     Length:1503424     Min.   :0.000e+00  
##  Class :character   Class :character   1st Qu.:5.000e+02  
##  Mode  :character   Mode  :character   Median :1.300e+03  
##                                        Mean   :3.167e+05  
##                                        3rd Qu.:7.000e+03  
##                                        Max.   :7.950e+10  
##                                        NA's   :85362      
##  item_seq_number    activation_date       user_type        
##  Min.   :     1.0   Min.   :2017-03-15   Length:1503424    
##  1st Qu.:     9.0   1st Qu.:2017-03-18   Class :character  
##  Median :    29.0   Median :2017-03-22   Mode  :character  
##  Mean   :   743.7   Mean   :2017-03-21                     
##  3rd Qu.:    88.0   3rd Qu.:2017-03-25                     
##  Max.   :204429.0   Max.   :2017-04-07                     
##                                                            
##     image            image_top_1     deal_probability
##  Length:1503424     Min.   :   0     Min.   :0.0000  
##  Class :character   1st Qu.: 425     1st Qu.:0.0000  
##  Mode  :character   Median :1057     Median :0.0000  
##                     Mean   :1242     Mean   :0.1391  
##                     3rd Qu.:2217     3rd Qu.:0.1509  
##                     Max.   :3066     Max.   :1.0000  
##                     NA's   :112588

3.3 Summary of test

# Per-column summary statistics of the test data.
avite %>% summary()
##    item_id            user_id             region         
##  Length:508438      Length:508438      Length:508438     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##      city           parent_category_name category_name     
##  Length:508438      Length:508438        Length:508438     
##  Class :character   Class :character     Class :character  
##  Mode  :character   Mode  :character     Mode  :character  
##                                                            
##                                                            
##                                                            
##                                                            
##    param_1            param_2            param_3         
##  Length:508438      Length:508438      Length:508438     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##     title           description            price          
##  Length:508438      Length:508438      Min.   :0.000e+00  
##  Class :character   Class :character   1st Qu.:5.000e+02  
##  Mode  :character   Mode  :character   Median :1.500e+03  
##                                        Mean   :2.798e+05  
##                                        3rd Qu.:8.600e+03  
##                                        Max.   :3.000e+09  
##                                        NA's   :30585      
##  item_seq_number    activation_date       user_type        
##  Min.   :     1.0   Min.   :2017-04-12   Length:508438     
##  1st Qu.:     8.0   1st Qu.:2017-04-13   Class :character  
##  Median :    30.0   Median :2017-04-15   Mode  :character  
##  Mean   :   825.1   Mean   :2017-04-14                     
##  3rd Qu.:    94.0   3rd Qu.:2017-04-17                     
##  Max.   :205064.0   Max.   :2017-04-20                     
##                                                            
##     image            image_top_1   
##  Length:508438      Min.   :   0   
##  Class :character   1st Qu.: 467   
##  Mode  :character   Median :1132   
##                     Mean   :1298   
##                     3rd Qu.:2218   
##                     Max.   :3066   
##                     NA's   :42609

4 LookUp & NA

4.1 Head train

# Interactive table of the first 50 training rows, 10 rows per page, with
# per-column filter boxes above the header.
# TRUE is spelled out: `T` is an ordinary variable that can be reassigned.
avi %>%
  head(50) %>%
  datatable(
    filter = "top",
    options = list(pageLength = 10, autoWidth = TRUE)
  )

4.2 Head test

# Interactive table of the first 50 test rows, 10 rows per page, with
# per-column filter boxes above the header.
# TRUE is spelled out: `T` is an ordinary variable that can be reassigned.
avite %>%
  head(50) %>%
  datatable(
    filter = "top",
    options = list(pageLength = 10, autoWidth = TRUE)
  )

4.3 NA

# Number of missing values in every column of train and test.
# vapply() pins the result to one integer per column, so the return type
# cannot silently change the way sapply()'s can.
avi_na <- vapply(avi, function(x) sum(is.na(x)), integer(1))
avite_na <- vapply(avite, function(x) sum(is.na(x)), integer(1))

# Print only the columns that actually contain missing values.
avi_na[avi_na > 0]
avite_na[avite_na > 0]
##     param_1     param_2     param_3 description       price       image 
##       61576      654542      862565      116276       85362      112588 
## image_top_1 
##      112588
##     param_1     param_2     param_3       price       image image_top_1 
##       22910      233229      306331       30585       42609       42609

5 Variables

# skimr overview of every training column, rendered as a knitr table.
kable(skim(avi))
## Skim summary statistics  
##  n obs: 1503424    
##  n variables: 18    
## 
## Variable type: Date
## 
## variable          missing   complete   n         min          max          median       n_unique 
## ----------------  --------  ---------  --------  -----------  -----------  -----------  ---------
## activation_date   0         1503424    1503424   2017-03-15   2017-04-07   2017-03-22   21       
## 
## Variable type: character
## 
## variable               missing   complete   n         min   max    empty   n_unique 
## ---------------------  --------  ---------  --------  ----  -----  ------  ---------
## category_name          0         1503424    1503424   5     28     0       47       
## city                   0         1503424    1503424   2     24     0       1733     
## description            116276    1387148    1503424   1     3212   0       1317102  
## image                  112588    1390836    1503424   64    64     0       1390836  
## item_id                0         1503424    1503424   12    12     0       1503424  
## param_1                61576     1441848    1503424   2     29     0       371      
## param_2                654542    848882     1503424   1     34     0       271      
## param_3                862565    640859     1503424   1     23     0       1219     
## parent_category_name   0         1503424    1503424   6     19     0       9        
## region                 0         1503424    1503424   8     23     0       28       
## title                  0         1503424    1503424   1     56     0       788377   
## user_id                0         1503424    1503424   12    12     0       771769   
## user_type              0         1503424    1503424   4     7      0       3        
## 
## Variable type: integer
## 
## variable          missing   complete   n         mean     sd        p0   p25   p50   p75   p100    hist     
## ----------------  --------  ---------  --------  -------  --------  ---  ----  ----  ----  ------  ---------
## item_seq_number   0         1503424    1503424   743.67   5572.52   1    9     29    88    2e+05   ▇▁▁▁▁▁▁▁ 
## 
## Variable type: numeric
## 
## variable           missing   complete   n         mean        sd        p0   p25   p50    p75    p100    hist     
## -----------------  --------  ---------  --------  ----------  --------  ---  ----  -----  -----  ------  ---------
## deal_probability   0         1503424    1503424   0.14        0.26      0    0     0      0.15   1       ▇▁▁▁▁▁▁▁ 
## image_top_1        112588    1390836    1503424   1241.93     970.46    0    425   1057   2217   3066    ▇▇▅▅▂▅▂▅ 
## price              85362     1418062    1503424   316708.09   6.7e+07   0    500   1300   7000   8e+10   ▇▁▁▁▁▁▁▁

5.1 item_id

5.1.1 train

# item_id is unique when its distinct count equals the number of train rows.
paste("All item_id are unique?", n_distinct(avi$item_id) == nrow(avi))
## [1] "All item_id are unique? TRUE"

5.1.2 test

# item_id is unique when its distinct count equals the number of test rows.
paste("All item_id are unique?", n_distinct(avite$item_id) == nrow(avite))
## [1] "All item_id are unique? TRUE"

All item_id values are unique in both train and test.

5.2 user_id

5.2.1 train unique proportion

# Share of distinct users among the training rows.
n_distinct(avi$user_id) / nrow(avi)
## [1] 0.5133409

5.2.2 test unique proportion

# Share of distinct users among the test rows. The denominator must be the
# test set's own row count; dividing by nrow(avi) understates the proportion.
# The printed value that follows this chunk was produced with the old
# denominator and will change when the notebook is re-knit.
length(unique(avite$user_id)) / nrow(avite)
## [1] 0.2035813

5.3 region

5.3.1 in train

# Russian region names, spelled as in the `region` column of the data.
region <- c("Краснодарский край","Свердловская область", 
            "Ростовская область","Татарстан","Челябинская область",
            "Нижегородская область","Самарская область",
            "Башкортостан","Пермский край","Новосибирская область",
            "Ставропольский край","Ханты-Мансийский АО",
            "Воронежская область","Иркутская область",
            "Тульская область","Тюменская область","Белгородская область")
# English labels, matched to `region` by position.
region_en <- c("Krasnodar","Sverdlovsk","Rostov","Tatarstan", 
               "Chelyabinsk","Nizhny Novgorod","Samara", 
               "Bashkortostan","Perm","Novosibirsk", 
               "Stavropol","Khanty-Mansiysk Autonomous Okrug", 
               "Voronezh","Irkutsk","Tula","Tyumen","Belgorod")

# A positional lookup is only valid when both vectors have the same length;
# fail loudly instead of letting recycling pair names with the wrong label.
stopifnot(length(region) == length(region_en))

# Lookup table joined onto region counts in the plots below. It is built
# column-wise so both columns stay character on every R version; going
# through cbind() creates a character matrix first, and on R < 4.0
# as.data.frame() then converts the columns to factors.
df_regions_en <- data.frame(region, region_en, stringsAsFactors = FALSE)

# Horizontal bar chart of the ten regions with the most training ads, shown
# with English labels; each bar is annotated with its share of all train rows.
avi %>%
  group_by(region) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  head(10) %>%
  left_join(df_regions_en) %>%
  # Order the bars by ad count rather than alphabetically.
  mutate(region_en = reorder(region_en, Count)) %>%
  ggplot(mapping = aes(x = region_en, y = Count)) +
  geom_col() +
  geom_text(
    mapping = aes(
      x = region_en,
      y = 1,
      label = paste(round(Count / nrow(avi) * 100, 2), "%")
    ),
    hjust = 0,
    vjust = 0.5,
    fontface = "bold",
    color = "orange"
  ) +
  coord_flip() +
  theme_wsj() +
  labs(x = "region", y = "count", title = "Most Popular Region")

5.3.2 in test

# Horizontal bar chart of the ten regions with the most test ads, shown with
# English labels; each bar is annotated with its share of all test rows.
avite %>%
  group_by(region) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  head(10) %>%
  # Explicit key: documents the join column and silences the "Joining, by"
  # message.
  left_join(df_regions_en, by = "region") %>%
  # Order the bars by ad count rather than alphabetically.
  mutate(region_en = reorder(region_en, Count)) %>%
  ggplot(aes(x = region_en, y = Count)) +
  geom_col() +
  coord_flip() +
  theme_wsj() +
  # Percentages are relative to the test set, so divide by nrow(avite);
  # using nrow(avi) here would shrink every label.
  geom_text(
    aes(
      x = region_en,
      y = 1,
      label = paste(round((Count / nrow(avite)) * 100, 2), "%")
    ),
    hjust = 0, vjust = 0.5, fontface = "bold", color = "orange"
  ) +
  labs(x = "region", y = "count", title = "Most Popular Region")

5.3.3 Distribution of Deal Probability

# The ten regions with the most training ads.
region_dt <- avi %>%
  group_by(region) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  head(10)

# Box plot of deal_probability for ads in those ten regions, one box per
# region; x labels are rotated so the long region names stay readable.
avi %>%
  filter(region %in% region_dt$region) %>%
  ggplot(
    mapping = aes(x = factor(region), y = deal_probability, fill = region)
  ) +
  geom_boxplot() +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    x = "Region",
    y = "Deal Probablity",
    title = "Distribution of Deal Probablity"
  )

5.4 city

5.5 parent_category_name

5.6 category_name

5.7 param_1

5.8 param_2

5.9 param_3

5.10 title

Memory Limits Problem on Windows

5.10.3 Title Length

# Histogram of title length (as counted by str_count) in the training data.
avi %>%
  mutate(title_len = str_count(title)) %>%
  ggplot(mapping = aes(x = title_len)) +
  geom_histogram(bins = 30, fill = "lightblue") +
  labs(
    x = "Title Length",
    y = "Count",
    title = "train : Distibution of Title Length"
  )

# Same histogram for the test data.
avite %>%
  mutate(title_len = str_count(title)) %>%
  ggplot(mapping = aes(x = title_len)) +
  geom_histogram(bins = 30, fill = "lightblue") +
  labs(
    x = "Title Length",
    y = "Count",
    title = "test : Distibution of Title Length"
  )

5.11 description

# Histogram of description length for training ads whose description is
# shorter than 1000; rows with a missing description fall out at the filter
# because their length is NA.
avi %>%
  mutate(des_len = str_count(description)) %>%
  filter(des_len < 1000) %>%
  ggplot(mapping = aes(x = des_len)) +
  geom_histogram(bins = 30, fill = "lightblue")

5.12 price

5.12.1 Basic & Log

# Raw price histogram over training ads that have a price.
ggplot(filter(avi, !is.na(price)), aes(x = price)) +
  geom_histogram(bins = 50)

# Same histogram with both axes on a log10 scale.
ggplot(filter(avi, !is.na(price)), aes(x = price)) +
  geom_histogram(bins = 50, fill = "steelblue") +
  scale_x_log10() +
  scale_y_log10() +
  theme_wsj() +
  labs(title = "Price Log Graph")

5.12.2 with UserType

# Log-log price histogram over training ads that have a price, with bars
# filled by user_type.
ggplot(filter(avi, !is.na(price)), aes(x = price, fill = user_type)) +
  geom_histogram(bins = 50) +
  scale_x_log10() +
  scale_y_log10() +
  theme_wsj() +
  labs(x = "price", y = "count", title = "Price Log Graph")

5.12.3 with Parent Category

# Log-log price histogram over training ads that have a price, with bars
# filled by the English parent-category label.
# NOTE(review): df_parentcategory_en is not defined in the visible part of
# this file - confirm it is created (with a parent_category_name_en column)
# before this chunk runs.
ggplot(
  left_join(filter(avi, !is.na(price)), df_parentcategory_en),
  aes(x = price, fill = parent_category_name_en)
) +
  geom_histogram(bins = 50) +
  scale_x_log10() +
  scale_y_log10() +
  theme_wsj() +
  labs(x = "price", y = "count", title = "Price Log Graph")

5.13 item_seq_number

5.13.1 Basic & Log

# Histogram of item_seq_number on the raw scale.
ggplot(avi, aes(x = item_seq_number)) +
  geom_histogram(bins = 30)

# Same histogram with a log10 x axis.
ggplot(avi, aes(x = item_seq_number)) +
  scale_x_log10() +
  geom_histogram(bins = 30)

5.13.2 with User Type

# Histogram of item_seq_number on a log10 x axis, bars filled by user_type.
ggplot(avi, aes(x = item_seq_number, fill = user_type)) +
  scale_x_log10() +
  geom_histogram(bins = 30)

5.14 activation_date

5.14.1 Graph

# Number of training ads per activation date.
# Counting on the full date (not day-of-month) keeps the bars in
# chronological order: the training window runs from 2017-03-15 to
# 2017-04-07, so day() would draw the April days before the March ones.
avi %>%
  group_by(activation_date) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = activation_date, y = Count)) +
  geom_col()

5.14.2 train DT

# Interactive table of training ad counts per day of month.
# TRUE is spelled out: `T` is an ordinary variable that can be reassigned.
avi %>%
  mutate(day = day(activation_date)) %>%
  group_by(day) %>%
  summarise(Count = n()) %>%
  datatable(
    filter = "top",
    options = list(pageLength = 30, autoWidth = TRUE)
  )

### test DT

# Interactive table of test ad counts per day of month.
# TRUE is spelled out: `T` is an ordinary variable that can be reassigned.
avite %>%
  mutate(day = day(activation_date)) %>%
  group_by(day) %>%
  summarise(Count = n()) %>%
  datatable(
    filter = "top",
    options = list(pageLength = 30, autoWidth = TRUE)
  )

5.14.3 weekday

# Number of training ads per weekday of activation.
# `label = TRUE` is written out in full: the previous `labe = T` only worked
# through partial argument matching and the reassignable `T` shortcut.
# NOTE(review): locale = "UK" looks like a platform-specific locale name -
# confirm it is accepted on the machine that knits this notebook.
avi %>%
  mutate(wday = wday(activation_date, label = TRUE, locale = "UK")) %>%
  group_by(wday) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = wday, y = Count)) +
  geom_col(fill = "lightblue", col = "white") +
  theme_bw() +
  labs(x = "Activation Weekday", y = "Count", title = "Activation Weekday")

5.15 user_type

5.15.1 on train

# Bar chart of training ads per user_type; each bar is annotated with its
# whole-percent share of all training rows.
avi %>%
  count(user_type) %>%
  rename(Count = n) %>%
  ggplot(mapping = aes(x = user_type, y = Count)) +
  geom_col(color = "orange", fill = "lightblue") +
  geom_text(
    mapping = aes(
      x = user_type,
      y = 50000,
      label = paste(round(Count * 100 / nrow(avi)), "%")
    ),
    size = 5,
    fontface = "bold"
  ) +
  labs(x = "User Type", y = "Count", title = "User Type")

5.15.2 on test

# Bar chart of test ads per user_type; each bar is annotated with its
# whole-percent share of all test rows.
avite %>%
  count(user_type) %>%
  rename(Count = n) %>%
  ggplot(mapping = aes(x = user_type, y = Count)) +
  geom_col(color = "orange", fill = "lightblue") +
  geom_text(
    mapping = aes(
      x = user_type,
      y = 15000,
      label = paste(round(Count * 100 / nrow(avite)), "%")
    ),
    size = 5,
    fontface = "bold"
  ) +
  labs(x = "User Type", y = "Count", title = "User Type")

5.16 image

5.16.1 image on train

# Flag column: 1 when the ad has an image id, 0 when `image` is missing.
avi$image_codeYN <- as.numeric(!is.na(avi$image))

# Bar chart of training ads with / without an image; each bar is annotated
# with its row count and its share of all training rows.
avi %>%
  group_by(image_codeYN) %>%
  summarise(Count = n()) %>%
  ggplot(mapping = aes(x = factor(image_codeYN), y = Count)) +
  geom_col(fill = "lightblue") +
  geom_text(
    mapping = aes(
      x = factor(image_codeYN),
      y = 500000,
      label = paste0(Count, "\n\n\n", round(Count * 100 / nrow(avi), 1), "%")
    )
  ) +
  theme_wsj() +
  labs(x = "image_codeYN", y = "Count", title = "Image Code Y/N Count")

5.16.2 image on test

# Flag column: 1 when the ad has an image id, 0 when `image` is missing.
avite$image_codeYN <- ifelse(is.na(avite$image), 0, 1)

# Bar chart of test ads with / without an image; each bar is annotated with
# its row count and its share of all test rows.
avite %>%
  group_by(image_codeYN) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = factor(image_codeYN), y = Count)) +
  geom_col(fill = "lightblue") +
  theme_wsj() +
  labs(x = "image_codeYN", y = "Count", title = "Image Code Y/N Count") +
  # The share is relative to the test set, so divide by nrow(avite); using
  # nrow(avi) here would shrink both percentages.
  geom_text(
    aes(
      x = factor(image_codeYN),
      y = 100000,
      label = paste0(Count, "\n\n\n", round(Count * 100 / nrow(avite), 1), "%")
    )
  )

5.16.3 with Deal Probability

# Box plot of deal_probability for training ads without (0) and with (1) an
# image; relies on the image_codeYN column added to `avi` earlier.
avi %>%
  ggplot(mapping = aes(x = factor(image_codeYN), y = deal_probability)) +
  geom_boxplot(fill = c("lightgreen", "lightblue")) +
  theme_wsj() +
  labs(
    x = "image_codeYN",
    y = "Deal Probability",
    title = "Distribution of Deal Probability on ImageCode"
  )

5.17 image_top_1

# Histogram of the image_top_1 classification code in the training data.
avi %>%
  ggplot(mapping = aes(x = image_top_1)) +
  geom_histogram(bins = 30)

5.18 deal_probability

5.18.1 bin 30

# Histogram of the target variable using 30 bins.
ggplot(avi, aes(x = deal_probability)) +
  geom_histogram(bins = 30, fill = "lightblue") +
  theme_wsj() +
  labs(
    x = "Deal Probability",
    y = "Count",
    title = "Distribution of Deal Probability"
  )

5.18.2 bin 10

# Histogram of the target variable using 10 bins.
ggplot(avi, aes(x = deal_probability)) +
  geom_histogram(bins = 10, fill = "lightblue") +
  theme_wsj() +
  labs(
    x = "Deal Probability",
    y = "Count",
    title = "Distribution of Deal Probability"
  )

5.18.3 bin 5

# Histogram of the target variable using 5 bins.
ggplot(avi, aes(x = deal_probability)) +
  geom_histogram(bins = 5, fill = "lightblue") +
  theme_wsj() +
  labs(
    x = "Deal Probability",
    y = "Count",
    title = "Distribution of Deal Probability"
  )