From Description Page
In their fourth Kaggle competition, Avito is challenging you to predict demand for an online advertisement based on its full description (title, description, images, etc.), its context (geographically where it was posted, similar ads already posted) and historical demand for similar ads in similar contexts. With this information, Avito can inform sellers on how to best optimize their listing and provide some indication of how much interest they should realistically expect to receive.
18 Variables
- item_id - Ad id.
- user_id - User id.
- region - Ad region.
- city - Ad city.
- parent_category_name - Top level ad category as classified by Avito’s ad model.
- category_name - Fine grain ad category as classified by Avito’s ad model.
- param_1 - Optional parameter from Avito’s ad model.
- param_2 - Optional parameter from Avito’s ad model.
- param_3 - Optional parameter from Avito’s ad model.
- title - Ad title.
- description - Ad description.
- price - Ad price.
- item_seq_number - Ad sequential number for user.
- activation_date - Date ad was placed.
- user_type - User type.
- image - Id code of image. Ties to a jpg file in train_jpg. Not every ad has an image.
- image_top_1 - Avito’s classification code for the image.
- deal_probability - The target variable. This is the likelihood that an ad actually sold something. It’s not possible to verify every transaction with certainty, so this column’s value can be any float from zero to one.
library(tidyverse)
library(knitr)
library(skimr)
library(DT)
library(ggthemes)
library(lubridate)
# Read the train and test CSV files from the input directory and print a
# column-by-column overview of the training table.
avi <- read_csv(file = "../input/train.csv")
avite <- read_csv(file = "../input/test.csv")
avi %>% glimpse()
## Observations: 1,503,424
## Variables: 18
## $ item_id <chr> "b912c3c6a6ad", "2dac0150717d", "ba83aefa...
## $ user_id <chr> "e00f8ff2eaf9", "39aeb48f0017", "91e2f88d...
## $ region <chr> "Свердловская область", "Самарская област...
## $ city <chr> "Екатеринбург", "Самара", "Ростов-на-Дону...
## $ parent_category_name <chr> "Личные вещи", "Для дома и дачи", "Бытова...
## $ category_name <chr> "Товары для детей и игрушки", "Мебель и и...
## $ param_1 <chr> "Постельные принадлежности", "Другое", "В...
## $ param_2 <chr> NA, NA, NA, NA, "ВАЗ (LADA)", NA, NA, "Дж...
## $ param_3 <chr> NA, NA, NA, NA, "2110", NA, NA, "26", "> ...
## $ title <chr> "Кокоби(кокон для сна)", "Стойка для Одеж...
## $ description <chr> "Кокон для сна малыша,пользовались меньше...
## $ price <dbl> 400, 3000, 4000, 2200, 40000, 1300, 11000...
## $ item_seq_number <int> 2, 19, 9, 286, 3, 9, 125, 61, 85, 136, 6,...
## $ activation_date <date> 2017-03-28, 2017-03-26, 2017-03-20, 2017...
## $ user_type <chr> "Private", "Private", "Private", "Company...
## $ image <chr> "d10c7e016e03247a3bf2d13348fe959fe6f436c1...
## $ image_top_1 <dbl> 1008, 692, 3032, 796, 2264, 796, 2823, 56...
## $ deal_probability <dbl> 0.12789, 0.00000, 0.43177, 0.80323, 0.207...
# Per-column summary statistics of the training table.
avi %>% summary()
## item_id user_id region
## Length:1503424 Length:1503424 Length:1503424
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## city parent_category_name category_name
## Length:1503424 Length:1503424 Length:1503424
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## param_1 param_2 param_3
## Length:1503424 Length:1503424 Length:1503424
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## title description price
## Length:1503424 Length:1503424 Min. :0.000e+00
## Class :character Class :character 1st Qu.:5.000e+02
## Mode :character Mode :character Median :1.300e+03
## Mean :3.167e+05
## 3rd Qu.:7.000e+03
## Max. :7.950e+10
## NA's :85362
## item_seq_number activation_date user_type
## Min. : 1.0 Min. :2017-03-15 Length:1503424
## 1st Qu.: 9.0 1st Qu.:2017-03-18 Class :character
## Median : 29.0 Median :2017-03-22 Mode :character
## Mean : 743.7 Mean :2017-03-21
## 3rd Qu.: 88.0 3rd Qu.:2017-03-25
## Max. :204429.0 Max. :2017-04-07
##
## image image_top_1 deal_probability
## Length:1503424 Min. : 0 Min. :0.0000
## Class :character 1st Qu.: 425 1st Qu.:0.0000
## Mode :character Median :1057 Median :0.0000
## Mean :1242 Mean :0.1391
## 3rd Qu.:2217 3rd Qu.:0.1509
## Max. :3066 Max. :1.0000
## NA's :112588
# Per-column summary statistics of the test table.
avite %>% summary()
## item_id user_id region
## Length:508438 Length:508438 Length:508438
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## city parent_category_name category_name
## Length:508438 Length:508438 Length:508438
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## param_1 param_2 param_3
## Length:508438 Length:508438 Length:508438
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## title description price
## Length:508438 Length:508438 Min. :0.000e+00
## Class :character Class :character 1st Qu.:5.000e+02
## Mode :character Mode :character Median :1.500e+03
## Mean :2.798e+05
## 3rd Qu.:8.600e+03
## Max. :3.000e+09
## NA's :30585
## item_seq_number activation_date user_type
## Min. : 1.0 Min. :2017-04-12 Length:508438
## 1st Qu.: 8.0 1st Qu.:2017-04-13 Class :character
## Median : 30.0 Median :2017-04-15 Mode :character
## Mean : 825.1 Mean :2017-04-14
## 3rd Qu.: 94.0 3rd Qu.:2017-04-17
## Max. :205064.0 Max. :2017-04-20
##
## image image_top_1
## Length:508438 Min. : 0
## Class :character 1st Qu.: 467
## Mode :character Median :1132
## Mean :1298
## 3rd Qu.:2218
## Max. :3066
## NA's :42609
# Interactive preview of the first 50 rows of each table, ten rows per page,
# with a filter box above every column.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
head(avi, 50) %>%
  datatable(filter = "top",
            options = list(pageLength = 10, autoWidth = TRUE))
head(avite, 50) %>%
  datatable(filter = "top",
            options = list(pageLength = 10, autoWidth = TRUE))
# Count missing values per column in train and test, then show only the
# columns that actually contain NAs.
# vapply() pins the result to one integer per column, so the return type
# cannot silently change the way sapply()'s can.
avi_na <- vapply(avi, function(x) sum(is.na(x)), integer(1))
avite_na <- vapply(avite, function(x) sum(is.na(x)), integer(1))
avi_na[avi_na > 0]
avite_na[avite_na > 0]
## param_1 param_2 param_3 description price image
## 61576 654542 862565 116276 85362 112588
## image_top_1
## 112588
## param_1 param_2 param_3 price image image_top_1
## 22910 233229 306331 30585 42609 42609
# skimr overview of the training table, rendered through knitr::kable().
kable(skim(avi))
## Skim summary statistics
## n obs: 1503424
## n variables: 18
##
## Variable type: Date
##
## variable missing complete n min max median n_unique
## ---------------- -------- --------- -------- ----------- ----------- ----------- ---------
## activation_date 0 1503424 1503424 2017-03-15 2017-04-07 2017-03-22 21
##
## Variable type: character
##
## variable missing complete n min max empty n_unique
## --------------------- -------- --------- -------- ---- ----- ------ ---------
## category_name 0 1503424 1503424 5 28 0 47
## city 0 1503424 1503424 2 24 0 1733
## description 116276 1387148 1503424 1 3212 0 1317102
## image 112588 1390836 1503424 64 64 0 1390836
## item_id 0 1503424 1503424 12 12 0 1503424
## param_1 61576 1441848 1503424 2 29 0 371
## param_2 654542 848882 1503424 1 34 0 271
## param_3 862565 640859 1503424 1 23 0 1219
## parent_category_name 0 1503424 1503424 6 19 0 9
## region 0 1503424 1503424 8 23 0 28
## title 0 1503424 1503424 1 56 0 788377
## user_id 0 1503424 1503424 12 12 0 771769
## user_type 0 1503424 1503424 4 7 0 3
##
## Variable type: integer
##
## variable missing complete n mean sd p0 p25 p50 p75 p100 hist
## ---------------- -------- --------- -------- ------- -------- --- ---- ---- ---- ------ ---------
## item_seq_number 0 1503424 1503424 743.67 5572.52 1 9 29 88 2e+05 ▇▁▁▁▁▁▁▁
##
## Variable type: numeric
##
## variable missing complete n mean sd p0 p25 p50 p75 p100 hist
## ----------------- -------- --------- -------- ---------- -------- --- ---- ----- ----- ------ ---------
## deal_probability 0 1503424 1503424 0.14 0.26 0 0 0 0.15 1 ▇▁▁▁▁▁▁▁
## image_top_1 112588 1390836 1503424 1241.93 970.46 0 425 1057 2217 3066 ▇▇▅▅▂▅▂▅
## price 85362 1418062 1503424 316708.09 6.7e+07 0 500 1300 7000 8e+10 ▇▁▁▁▁▁▁▁
# Check that item_id has no repeated values: first in train, then in test.
paste("All item_id are unique?", anyDuplicated(avi$item_id) == 0L)
## [1] "All item_id are unique? TRUE"
paste("All item_id are unique?", anyDuplicated(avite$item_id) == 0L)
## [1] "All item_id are unique? TRUE"
All item_id values are unique in both train and test.
# Ratio of distinct user_ids to rows, for train and for test.
length(unique(avi$user_id)) / nrow(avi)
## [1] 0.5133409
# The test ratio is divided by the test row count; the earlier version
# divided by nrow(avi), which mixed the two tables.
length(unique(avite$user_id)) / nrow(avite)
## NOTE: the previously recorded output (0.2035813) was computed with nrow(avi) as the denominator; re-run to refresh.
# Horizontal bar chart of the 20 user_ids with the most rows in train.
avi %>%
  count(user_id) %>%
  rename(Count = n) %>%
  arrange(desc(Count)) %>%
  head(20) %>%
  ggplot(aes(x = reorder(user_id, Count), y = Count)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(x = "User ID", y = "Count", title = "Most Popular User ID")
# Lookup table pairing Russian region names (as stored in the `region`
# column) with the English labels shown on plots.
region <- c("Краснодарский край","Свердловская область",
            "Ростовская область","Татарстан","Челябинская область",
            "Нижегородская область","Самарская область",
            "Башкортостан","Пермский край","Новосибирская область",
            "Ставропольский край","Ханты-Мансийский АО",
            "Воронежская область","Иркутская область",
            "Тульская область","Тюменская область","Белгородская область")
region_en <- c("Krasnodar","Sverdlovsk","Rostov","Tatarstan",
               "Chelyabinsk","Nizhny Novgorod","Samara",
               "Bashkortostan","Perm","Novosibirsk",
               "Stavropol","Khanty-Mansiysk Autonomous Okrug",
               "Voronezh","Irkutsk","Tula","Tyumen","Belgorod")
# Build the columns as character on every R version. as.data.frame(cbind())
# yields factors on R < 4.0, which forces a coercion when joined against the
# character `region` column.
df_regions_en <- data.frame(region = region, region_en = region_en,
                            stringsAsFactors = FALSE)
# Ten most frequent regions in train, labelled in English, with each bar
# annotated by its share of all training rows.
avi %>%
  group_by(region) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  head(10) %>%
  # The join key is named explicitly instead of being guessed from the
  # shared column names.
  left_join(df_regions_en, by = "region") %>%
  mutate(region_en = reorder(region_en, Count)) %>%
  ggplot(aes(x = region_en, y = Count)) +
  geom_col() + coord_flip() + theme_wsj() +
  geom_text(aes(x = region_en, y = 1,
                label = paste(round((Count / nrow(avi)) * 100, 2), "%")),
            hjust = 0, vjust = .5, fontface = "bold", color = "orange") +
  labs(x = "region", y = "count", title = "Most Popular Region")
# Ten most frequent regions in test, labelled in English, with each bar
# annotated by its share of all test rows.
avite %>%
  group_by(region) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  head(10) %>%
  left_join(df_regions_en, by = "region") %>%
  mutate(region_en = reorder(region_en, Count)) %>%
  ggplot(aes(x = region_en, y = Count)) +
  geom_col() + coord_flip() + theme_wsj() +
  # The percentage uses the test row count; dividing by nrow(avi)
  # understated every share.
  geom_text(aes(x = region_en, y = 1,
                label = paste(round((Count / nrow(avite)) * 100, 2), "%")),
            hjust = 0, vjust = .5, fontface = "bold", color = "orange") +
  labs(x = "region", y = "count", title = "Most Popular Region")
# Ten most frequent regions in train, kept in `region_dt` for filtering.
region_dt <- avi %>%
  group_by(region) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  head(10)
# Boxplot of deal_probability for those ten regions. The axis label and
# title spell "Probability" correctly, matching the other deal-probability
# plots in this file.
avi %>%
  filter(region %in% region_dt$region) %>%
  ggplot(aes(x = factor(region), y = deal_probability, fill = region)) +
  geom_boxplot() + theme_bw() +
  labs(x = "Region", y = "Deal Probability",
       title = "Distribution of Deal Probability") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Lookup table pairing Russian city names (as stored in the `city` column)
# with English labels.
city <- c("Краснодар","Екатеринбург","Новосибирск","Ростов-на-Дону","Нижний Новгород",
          "Челябинск","Пермь","Казань","Самара","Омск")
# "Пермь" is labelled "Perm", matching the region lookup in this file; the
# earlier "Permian" was a mistranslation.
city_en <- c("Krasnodar","Ekaterinburg","Novosibirsk","Rostov-na-Donu",
             "Nizhny Novgorod", "Chelyabinsk","Perm","Kazan","Samara","Omsk")
# Character columns on every R version (no factors on R < 4.0).
df_city_en <- data.frame(city = city, city_en = city_en,
                         stringsAsFactors = FALSE)
# Ten most frequent cities in train as a horizontal bar chart with English
# labels. The join key is stated explicitly.
avi %>%
  group_by(city) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  head(10) %>%
  left_join(df_city_en, by = "city") %>%
  mutate(city_en = reorder(city_en, Count)) %>%
  ggplot(aes(x = city_en, y = Count)) +
  labs(x = "City", y = "Count", title = "Most Popular City") +
  geom_col() + coord_flip() + theme_wsj()
# The same ten cities as an interactive table.
avi %>%
  group_by(city) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  head(10) %>%
  left_join(df_city_en, by = "city") %>%
  datatable()
# Lookup table pairing Russian parent category names with English labels.
parent_category_name <- c("Личные вещи","Для дома и дачи",
                          "Бытовая электроника","Недвижимость",
                          "Хобби и отдых","Транспорт",
                          "Услуги","Животные","Для бизнеса")
parent_category_name_en <- c("Personal things","home and cottages",
                             "Consumer electronics","Property",
                             "Hobbies and Recreation","Transport",
                             "services","Animals","business")
# Character columns on every R version (no factors on R < 4.0), so joins
# against the character `parent_category_name` column need no coercion.
df_parentcategory_en <- data.frame(
  parent_category_name = parent_category_name,
  parent_category_name_en = parent_category_name_en,
  stringsAsFactors = FALSE
)
# Row counts per parent category in train, labelled in English, with each
# bar annotated by its share of all training rows. Every join names its key
# explicitly.
avi %>%
  group_by(parent_category_name) %>%
  summarise(Count = n()) %>%
  left_join(df_parentcategory_en, by = "parent_category_name") %>%
  arrange(desc(Count)) %>%
  ggplot(aes(x = reorder(parent_category_name_en, Count), y = Count)) +
  geom_col(fill = "lightblue") + coord_flip() + theme_bw() +
  labs(x = "Parent Category", y = "Count",
       title = "Most Popular Parent Category") +
  geom_text(aes(x = parent_category_name_en, y = 5000,
                label = paste(round(Count * 100 / nrow(avi), 1), "%")),
            hjust = 0, vjust = .5, fontface = "bold")
# Same chart with a log10 count axis.
avi %>%
  group_by(parent_category_name) %>%
  summarise(Count = n()) %>%
  left_join(df_parentcategory_en, by = "parent_category_name") %>%
  arrange(desc(Count)) %>%
  ggplot(aes(x = reorder(parent_category_name_en, Count), y = Count)) +
  geom_col(fill = "lightblue") + coord_flip() + theme_bw() +
  labs(x = "Parent Category", y = "Count",
       title = "Most Popular Parent Category") +
  geom_text(aes(x = parent_category_name_en, y = 5000,
                label = paste(round(Count * 100 / nrow(avi), 1), "%")),
            hjust = 0, vjust = .5, fontface = "bold") +
  scale_y_log10()
# The same counts as an interactive table.
avi %>%
  group_by(parent_category_name) %>%
  summarise(Count = n()) %>%
  left_join(df_parentcategory_en, by = "parent_category_name") %>%
  arrange(desc(Count)) %>%
  datatable()
# Lookup table pairing Russian category names with English labels.
category_name <- c("Одежда, обувь, аксессуары",
                   "Детская одежда и обувь",
                   "Товары для детей и игрушки",
                   "Квартиры",
                   "Телефоны",
                   "Мебель и интерьер",
                   "Предложение услуг",
                   "Автомобили",
                   "Ремонт и строительство",
                   "Бытовая техника",
                   "Недвижимость за рубежом",
                   "Дома, дачи, коттеджи",
                   "Земельные участки",
                   "Комнаты",
                   "Грузовики и спецтехника",
                   "Готовый бизнес",
                   "Гаражи и машиноместа",
                   "Коммерческая недвижимость")
category_name_en <- c("Clothes,shoes accessories" ,
                      "Children's clothing and footwear" ,
                      "Goods for children and toys" ,
                      "Apartments" ,
                      "Phones",
                      "Furniture and interior",
                      "Offer of services",
                      "Cars",
                      "Repair and construction",
                      "Appliances",
                      "Property Abroad",
                      "Houses, cottages, cottages",
                      "Land",
                      "Rooms",
                      "Trucks and special equipment",
                      "Ready business",
                      "Garages and parking places",
                      "Commercial Property")
# Character columns on every R version (no factors on R < 4.0).
df_category_en <- data.frame(category_name = category_name,
                             category_name_en = category_name_en,
                             stringsAsFactors = FALSE)
# Ten most frequent categories in train, labelled in English, with each bar
# annotated by its share of all training rows. The join key is explicit.
avi %>%
  group_by(category_name) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  head(10) %>%
  left_join(df_category_en, by = "category_name") %>%
  ggplot(aes(x = reorder(category_name_en, Count), y = Count)) +
  geom_col(fill = "lightblue") + coord_flip() + theme_bw() +
  labs(x = "Category", y = "Count", title = "Most Popular Category") +
  geom_text(aes(x = category_name_en, y = 5000,
                label = paste(round(Count * 100 / nrow(avi), 1), "%")),
            hjust = 0, vjust = .5, fontface = "bold")
# The same ten categories as an interactive table.
avi %>%
  group_by(category_name) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  head(10) %>%
  left_join(df_category_en, by = "category_name") %>%
  datatable()
# Lookup table pairing Russian param_1 values with English labels.
param_1 <- c("Женская одежда","Для девочек","Для мальчиков",
             "Продам","С пробегом","Аксессуары",
             "Мужская одежда","Другое",
             "Игрушки","Детские коляски")
param_1_en <- c("Women's clothing","For girls","For boys",
                "Selling","With mileage","Accessories","Men's clothing","Other",
                "Toys","Baby carriages")
# Character columns on every R version (no factors on R < 4.0).
df_param_1_en <- data.frame(param_1 = param_1, param_1_en = param_1_en,
                            stringsAsFactors = FALSE)
# Ten most frequent non-missing param_1 values in train, labelled in
# English. The percentage is relative to all training rows, including rows
# where param_1 is NA. The join key is explicit and the plot title typo
# ("Poupular") is corrected.
avi %>%
  filter(!is.na(param_1)) %>%
  group_by(param_1) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  head(10) %>%
  left_join(df_param_1_en, by = "param_1") %>%
  ggplot(aes(x = reorder(param_1_en, Count), y = Count)) +
  geom_col(fill = "lightblue") + coord_flip() + theme_bw() +
  labs(x = "Param_1", y = "Count", title = "Most Popular Param 1") +
  geom_text(aes(x = param_1_en, y = 10000,
                label = paste(round(Count * 100 / nrow(avi), 1), "%")),
            hjust = 0, vjust = .5, fontface = "bold")
# The same ten values as an interactive table.
avi %>%
  filter(!is.na(param_1)) %>%
  group_by(param_1) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  head(10) %>%
  left_join(df_param_1_en, by = "param_1") %>%
  datatable()
# Lookup table pairing Russian param_2 values with English labels.
param_2 <- c("Обувь","Верхняя одежда",
             "Платья и юбки","Другое",
             "Трикотаж","Брюки",
             "1","2","На длительный срок","Дом")
param_2_en <- c("Footwear","Outerwear","Dresses and skirts","Other",
                "Knitwear","Pants","1","2","For a long time","House")
# Character columns on every R version (no factors on R < 4.0).
df_param_2_en <- data.frame(param_2 = param_2, param_2_en = param_2_en,
                            stringsAsFactors = FALSE)
# Ten most frequent non-missing param_2 values in train, labelled in
# English. The percentage is relative to all training rows. The join key is
# explicit.
avi %>%
  filter(!is.na(param_2)) %>%
  group_by(param_2) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  head(10) %>%
  left_join(df_param_2_en, by = "param_2") %>%
  ggplot(aes(x = reorder(param_2_en, Count), y = Count)) +
  geom_col(fill = "lightblue") + coord_flip() + theme_bw() +
  labs(x = "Param_2", y = "Count", title = "Most Popular Param 2") +
  geom_text(aes(x = param_2_en, y = 10000,
                label = paste(round(Count * 100 / nrow(avi), 1), "%")),
            hjust = 0, vjust = .5, fontface = "bold")
# The same ten values as an interactive table.
avi %>%
  filter(!is.na(param_2)) %>%
  group_by(param_2) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  head(10) %>%
  left_join(df_param_2_en, by = "param_2") %>%
  datatable()
# Ten most frequent non-missing param_3 values in train (raw values, no
# translation), each bar annotated with its share of all training rows.
avi %>%
  filter(!is.na(param_3)) %>%
  count(param_3) %>%
  rename(Count = n) %>%
  arrange(desc(Count)) %>%
  head(10) %>%
  ggplot(aes(x = reorder(param_3, Count), y = Count)) +
  geom_col(fill = "lightblue") +
  coord_flip() +
  theme_bw() +
  labs(x = "Param 3", y = "Count", title = "Most Popular Param 3") +
  geom_text(
    aes(x = param_3, y = 5000,
        label = paste(round(Count * 100 / nrow(avi), 1), "%")),
    hjust = 0, vjust = .5, fontface = "bold"
  )
# The same ten values as an interactive table.
avi %>%
  filter(!is.na(param_3)) %>%
  count(param_3) %>%
  rename(Count = n) %>%
  arrange(desc(Count)) %>%
  head(10) %>%
  datatable()
Memory Limits Problem on Windows
# Lookup table pairing Russian ad titles with English labels.
title <- c("Платье","Туфли","Куртка",
           "Пальто","Джинсы","Комбинезон",
           "Кроссовки","Костюм","Ботинки",
           "Босоножки")
title_en <- c("Dress","Shoes","Jacket",
              "Coat","Jeans","Overalls",
              "Sneakers","Costume","Boots",
              "Sandals")
# Character columns on every R version (no factors on R < 4.0).
df_title_en <- data.frame(title = title, title_en = title_en,
                          stringsAsFactors = FALSE)
# Ten most frequent ad titles in train, labelled in English, with each bar
# annotated by its share of all training rows. The join key is explicit.
avi %>%
  group_by(title) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  head(10) %>%
  left_join(df_title_en, by = "title") %>%
  ggplot(aes(x = reorder(title_en, Count), y = Count)) +
  geom_col(fill = "lightblue") + coord_flip() + theme_bw() +
  labs(x = "Title", y = "Count", title = "Most Popular Title") +
  geom_text(aes(x = title_en, y = 2000,
                label = paste(round(Count * 100 / nrow(avi), 1), "%")),
            hjust = 0, vjust = .5, fontface = "bold")
# The same ten titles as a static table.
avi %>%
  group_by(title) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  head(10) %>%
  left_join(df_title_en, by = "title") %>%
  kable()
| title | Count | title_en |
|---|---|---|
| Платье | 15550 | Dress |
| Туфли | 6334 | Shoes |
| Куртка | 6331 | Jacket |
| Пальто | 5251 | Coat |
| Джинсы | 4758 | Jeans |
| Комбинезон | 4506 | Overalls |
| Кроссовки | 3196 | Sneakers |
| Костюм | 2916 | Costume |
| Ботинки | 2876 | Boots |
| Босоножки | 2760 | Sandals |
# Histograms of title length (as counted by str_count()) for train and
# test. The plot titles spell "Distribution" correctly, matching the
# deal-probability plots in this file.
avi %>%
  mutate(title_len = str_count(title)) %>%
  ggplot(aes(x = title_len)) +
  geom_histogram(bins = 30, fill = "lightblue") +
  labs(x = "Title Length", y = "Count",
       title = "train : Distribution of Title Length")
avite %>%
  mutate(title_len = str_count(title)) %>%
  ggplot(aes(x = title_len)) +
  geom_histogram(bins = 30, fill = "lightblue") +
  labs(x = "Title Length", y = "Count",
       title = "test : Distribution of Title Length")
# Histogram of description length in train, restricted to descriptions
# shorter than 1000. Rows with a missing description have an NA length and
# are dropped by the filter.
avi %>%
  mutate(des_len = str_count(description)) %>%
  filter(des_len < 1000) %>%
  ggplot(aes(x = des_len)) +
  geom_histogram(bins = 30, fill = "lightblue")
# Price histogram on the raw scale (rows with a missing price removed).
avi %>%
  drop_na(price) %>%
  ggplot(aes(x = price)) +
  geom_histogram(bins = 50)
# Same histogram with log10 scales on both axes.
avi %>%
  drop_na(price) %>%
  ggplot(aes(x = price)) +
  geom_histogram(bins = 50, fill = "steelblue") +
  labs(title = "Price Log Graph") +
  scale_x_log10() +
  scale_y_log10() +
  theme_wsj()
# Log-log price histogram with bars filled by user_type.
avi %>%
  drop_na(price) %>%
  ggplot(aes(x = price, fill = user_type)) +
  geom_histogram(bins = 50) +
  labs(x = "price", y = "count", title = "Price Log Graph") +
  scale_x_log10() +
  scale_y_log10() +
  theme_wsj()
# Log-log price histogram with bars filled by the English parent category
# label. The join key is named explicitly rather than inferred from shared
# column names.
avi %>%
  filter(!is.na(price)) %>%
  left_join(df_parentcategory_en, by = "parent_category_name") %>%
  ggplot(aes(x = price, fill = parent_category_name_en)) +
  geom_histogram(bins = 50) +
  labs(x = "price", y = "count", title = "Price Log Graph") +
  scale_x_log10() + scale_y_log10() + theme_wsj()
# item_seq_number histogram on the raw scale.
ggplot(avi, aes(x = item_seq_number)) +
  geom_histogram(bins = 30)
# Same histogram with a log10 x axis.
ggplot(avi, aes(x = item_seq_number)) +
  geom_histogram(bins = 30) +
  scale_x_log10()
# Log10 x axis, bars filled by user_type.
ggplot(avi, aes(x = item_seq_number, fill = user_type)) +
  geom_histogram(bins = 30) +
  scale_x_log10()
# Number of training rows per day of month of activation_date.
avi %>%
  mutate(day = day(activation_date)) %>%
  count(day) %>%
  rename(Count = n) %>%
  ggplot(aes(x = day, y = Count)) +
  geom_col()
# Interactive tables of row counts per day of month of activation_date, for
# train and then test. TRUE is spelled out because `T` can be reassigned.
avi %>%
  mutate(day = day(activation_date)) %>%
  group_by(day) %>%
  summarise(Count = n()) %>%
  datatable(filter = "top", options = list(pageLength = 30, autoWidth = TRUE))
avite %>%
  mutate(day = day(activation_date)) %>%
  group_by(day) %>%
  summarise(Count = n()) %>%
  datatable(filter = "top", options = list(pageLength = 30, autoWidth = TRUE))
# Number of training rows per weekday of activation_date.
# `label = TRUE` is written out in full: the earlier `labe=T` relied on
# partial argument matching and on the reassignable `T`.
# NOTE(review): locale "UK" is passed through unchanged; whether that name
# resolves depends on the platform's locale naming - confirm on the target
# system.
avi %>%
  mutate(wday = wday(activation_date, label = TRUE, locale = "UK")) %>%
  group_by(wday) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = wday, y = Count)) +
  geom_col(fill = "lightblue", col = "white") + theme_bw() +
  labs(x = "Activation Weekday", y = "Count", title = "Activation Weekday")
# Row counts per user_type in train, each bar labelled with its rounded
# share of training rows.
avi %>%
  count(user_type) %>%
  rename(Count = n) %>%
  ggplot(aes(x = user_type, y = Count)) +
  geom_col(color = "orange", fill = "lightblue") +
  labs(x = "User Type", y = "Count", title = "User Type") +
  geom_text(
    aes(x = user_type, y = 50000,
        label = paste(round(Count * 100 / nrow(avi)), "%")),
    size = 5, fontface = "bold"
  )
# Same chart for test, with shares relative to the test row count.
avite %>%
  count(user_type) %>%
  rename(Count = n) %>%
  ggplot(aes(x = user_type, y = Count)) +
  geom_col(color = "orange", fill = "lightblue") +
  labs(x = "User Type", y = "Count", title = "User Type") +
  geom_text(
    aes(x = user_type, y = 15000,
        label = paste(round(Count * 100 / nrow(avite)), "%")),
    size = 5, fontface = "bold"
  )
# Flag column: 1 when the row has an image id, 0 when `image` is NA.
avi$image_codeYN <- as.numeric(!is.na(avi$image))
# Row counts per flag value in train, each bar labelled with the count and
# its share of training rows.
avi %>%
  group_by(image_codeYN) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = factor(image_codeYN), y = Count)) +
  geom_col(fill = "lightblue") +
  theme_wsj() +
  labs(x = "image_codeYN", y = "Count", title = "Image Code Y/N Count") +
  geom_text(
    aes(x = factor(image_codeYN), y = 500000,
        label = paste0(Count, "\n\n\n",
                       round(Count * 100 / nrow(avi), 1), "%"))
  )
# Flag column: 1 when the test row has an image id, 0 when `image` is NA.
avite$image_codeYN <- ifelse(is.na(avite$image), 0, 1)
# Row counts per flag value in test, each bar labelled with the count and
# its share of test rows.
avite %>%
  group_by(image_codeYN) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = factor(image_codeYN), y = Count)) +
  geom_col(fill = "lightblue") + theme_wsj() +
  labs(x = "image_codeYN", y = "Count", title = "Image Code Y/N Count") +
  # The share uses the test row count; dividing by nrow(avi) made the two
  # percentages sum to far less than 100.
  geom_text(aes(x = factor(image_codeYN), y = 100000,
                label = paste0(Count, "\n\n\n",
                               round(Count * 100 / nrow(avite), 1), "%")))
# Boxplot of deal_probability split by the image flag. The two fill colours
# are given positionally, one per box.
avi %>%
  ggplot(aes(x = factor(image_codeYN), y = deal_probability)) +
  geom_boxplot(fill = c("lightgreen", "lightblue")) +
  theme_wsj() +
  labs(
    x = "image_codeYN",
    y = "Deal Probability",
    title = "Distribution of Deal Probability on ImageCode"
  )
# Histogram of the image_top_1 code in train.
avi %>%
  ggplot(aes(x = image_top_1)) +
  geom_histogram(bins = 30)
# Histogram of the target, deal_probability, at three bin counts
# (30, 10 and 5).
ggplot(avi, aes(x = deal_probability)) +
  geom_histogram(bins = 30, fill = "lightblue") +
  theme_wsj() +
  labs(x = "Deal Probability", y = "Count",
       title = "Distribution of Deal Probability")
ggplot(avi, aes(x = deal_probability)) +
  geom_histogram(bins = 10, fill = "lightblue") +
  theme_wsj() +
  labs(x = "Deal Probability", y = "Count",
       title = "Distribution of Deal Probability")
ggplot(avi, aes(x = deal_probability)) +
  geom_histogram(bins = 5, fill = "lightblue") +
  theme_wsj() +
  labs(x = "Deal Probability", y = "Count",
       title = "Distribution of Deal Probability")