Booking.com European landscape analysis
Importing the Data Set
Story Time
As a team member of Data Analytics Team at Booking.com, I have been asked to gather insights on the mid- to high-end (luxury) hotel market in Europe. I am to present my results to my Team Lead.
#`## Cleaning Data Set global_hotel_chain_sizec
First, let’s have a look at the overall global luxury & upscale hotel market.
# let's have a look on our data set
#describe(global_hotel_chain_size)
#Remove empty columns and rows
global_hotel_chain_size_clean<-remove_empty(global_hotel_chain_size, which = c("rows","cols"))
#describe(global_hotel_chain_size_clean)
Cleaning Data Set hotels_raw
glimpse(hotels_raw)
## Rows: 515,738
## Columns: 17
## $ hotel_address <chr> "s Gravesandestraat 55 Oos…
## $ additional_number_of_scoring <dbl> 194, 194, 194, 194, 194, 1…
## $ review_date <chr> "8/3/2017", "8/3/2017", "7…
## $ average_score <dbl> 7.7, 7.7, 7.7, 7.7, 7.7, 7…
## $ hotel_name <chr> "Hotel Arena", "Hotel Aren…
## $ reviewer_nationality <chr> "Russia", "Ireland", "Aust…
## $ negative_review <chr> "I am so angry that i made…
## $ review_total_negative_word_counts <dbl> 397, 0, 42, 210, 140, 17, …
## $ total_number_of_reviews <dbl> 1403, 1403, 1403, 1403, 14…
## $ positive_review <chr> "Only the park outside of …
## $ review_total_positive_word_counts <dbl> 11, 105, 21, 26, 8, 20, 18…
## $ total_number_of_reviews_reviewer_has_given <dbl> 7, 7, 9, 1, 3, 1, 6, 1, 3,…
## $ reviewer_score <dbl> 2.9, 7.5, 7.1, 3.8, 6.7, 6…
## $ tags <chr> "[' Leisure trip ', ' Coup…
## $ days_since_review <chr> "0 days", "0 days", "3 day…
## $ lat <dbl> 52.4, 52.4, 52.4, 52.4, 52…
## $ lng <dbl> 4.92, 4.92, 4.92, 4.92, 4.…
hotels_date_correct<-hotels_raw %>% mutate(review_date=as.Date(review_date,"%d/%m/%Y")) %>% mutate(country=word(hotel_address,-1,sep=fixed(" ")))
#glimpse(hotels_date_correct)
#Remove empty columns and rows
hotel1<-remove_empty(hotels_date_correct, which = c("rows","cols"))%>%
filter(!is.na(lat))
#Check for duplicates (visualize bots)
duplicates<-hotel1%>%get_dupes(positive_review, negative_review, hotel_name, reviewer_nationality)
#duplicates
#delete duplicates
hotel2<-hotel1%>%
distinct(positive_review, negative_review, hotel_name, reviewer_nationality, .keep_all = TRUE)
md.pattern(hotel2,rotate.names = T)

## hotel_address additional_number_of_scoring average_score hotel_name
## 201588 1 1 1 1
## 307726 1 1 1 1
## 374 1 1 1 1
## 549 1 1 1 1
## 204 1 1 1 1
## 315 1 1 1 1
## 50 1 1 1 1
## 76 1 1 1 1
## 24 1 1 1 1
## 39 1 1 1 1
## 0 0 0 0
## review_total_negative_word_counts total_number_of_reviews
## 201588 1 1
## 307726 1 1
## 374 1 1
## 549 1 1
## 204 1 1
## 315 1 1
## 50 1 1
## 76 1 1
## 24 1 1
## 39 1 1
## 0 0
## review_total_positive_word_counts
## 201588 1
## 307726 1
## 374 1
## 549 1
## 204 1
## 315 1
## 50 1
## 76 1
## 24 1
## 39 1
## 0
## total_number_of_reviews_reviewer_has_given reviewer_score tags
## 201588 1 1 1
## 307726 1 1 1
## 374 1 1 1
## 549 1 1 1
## 204 1 1 1
## 315 1 1 1
## 50 1 1 1
## 76 1 1 1
## 24 1 1 1
## 39 1 1 1
## 0 0 0
## days_since_review lat lng country positive_review reviewer_nationality
## 201588 1 1 1 1 1 1
## 307726 1 1 1 1 1 1
## 374 1 1 1 1 1 1
## 549 1 1 1 1 1 1
## 204 1 1 1 1 1 0
## 315 1 1 1 1 1 0
## 50 1 1 1 1 0 1
## 76 1 1 1 1 0 1
## 24 1 1 1 1 0 1
## 39 1 1 1 1 0 1
## 0 0 0 0 189 519
## negative_review review_date
## 201588 1 1 0
## 307726 1 0 1
## 374 0 1 1
## 549 0 0 2
## 204 1 1 1
## 315 1 0 2
## 50 1 1 1
## 76 1 0 2
## 24 0 1 2
## 39 0 0 3
## 986 308705 310399
Europe vs. different continents - Luxury Chains, Upper Upscale Chains
Now I am good to go.
First analysis:
# distribution of all the variables chain scale
data.frame(table(global_hotel_chain_size_clean$chain_scale))
## Var1 Freq
## 1 Economy Chains 141
## 2 Luxury Chains 115
## 3 Midscale Chains 174
## 4 Upper Midscale Chains 225
## 5 Upper Upscale Chains 161
## 6 Upscale Chains 276
# changing variables to factors
global_hotel_chain_size_clean$chain_scale <- global_hotel_chain_size_clean$chain_scale %>% factor(levels= c("Economy Chains", "Midscale Chains", "Upper Midscale Chains","Upscale Chains", "Upper Upscale Chains", "Luxury Chains" ))
# checking if the data is correct
data.frame(table(global_hotel_chain_size_clean$chain_scale))
## Var1 Freq
## 1 Economy Chains 141
## 2 Midscale Chains 174
## 3 Upper Midscale Chains 225
## 4 Upscale Chains 276
## 5 Upper Upscale Chains 161
## 6 Luxury Chains 115
# pivoting the data & cleaning
chain_size_pivot<-global_hotel_chain_size_clean %>%
pivot_longer(cols = 5:11, names_to= "region", values_to= "x", values_drop_na = TRUE) %>%
mutate( region = case_when(
region== "africa" ~ "Africa",
region=="apac" ~ "APAC countries",
region== "c_s_america" ~ "Central & South America",
region== "europe" ~ "Europe",
region== "middle_east" ~ "Middle East",
region== "n_america_excl_us" ~ "North America exlc. US",
region== "united_states" ~ "United States",
)) %>%
select(-x)
# Luxury Chains across countries
chain_size_pivot_relevant<- chain_size_pivot %>%
filter(chain_scale== "Luxury Chains"|| chain_scale== "Upper Upscale Chains" || chain_scale== "Upscale Chains" ) %>%
group_by( region) %>%
summarise(count=n())
# Upper Upscale Chains across countires
chain_size_pivot %>%
filter(chain_scale== "Upper Upscale Chains") %>%
group_by( region) %>%
summarise(count=n())
## # A tibble: 7 x 2
## region count
## <chr> <int>
## 1 Africa 33
## 2 APAC countries 78
## 3 Central & South America 29
## 4 Europe 95
## 5 Middle East 34
## 6 North America exlc. US 30
## 7 United States 57
#Upscale Chains across countires
chain_size_pivot %>%
filter(chain_scale== "Upscale Chains") %>%
group_by( region) %>%
summarise(count=n())
## # A tibble: 7 x 2
## region count
## <chr> <int>
## 1 Africa 33
## 2 APAC countries 110
## 3 Central & South America 40
## 4 Europe 153
## 5 Middle East 37
## 6 North America exlc. US 43
## 7 United States 56
# charts showing how big Europe is in these sectors
my_colours <- c("grey70", "#2FABE1")
is_europe<- chain_size_pivot_relevant%>%
mutate(
is_europe = ifelse(region == "Europe", TRUE, FALSE))
# Chart of the data across all the regions
first_plot<- ggplot(is_europe, aes(x=reorder(region,-count), y=count, fill=is_europe)) +
geom_bar(stat="identity", alpha=0.8)+
theme_minimal() +
theme(panel.grid.major.y = element_line(color = "gray60", size = 0.1),
panel.background = element_rect(fill = "white", colour = "white"),
axis.line = element_line(size = 1, colour = "grey80"),
axis.ticks = element_line(size = 3,colour = "grey80"),
axis.ticks.length = unit(.20, "cm"),
plot.title = element_text(color = "#043680",size=15,face="bold", family= "Montserrat"),
plot.subtitle = element_text(color = "#043680", face="bold", ,size= 10,family= "Montserrat"),
plot.caption = element_text(color = "grey40", face="italic", ,size= 7,family= "Montserrat",hjust=0),
axis.title.y = element_text(size = 8, angle = 90, family="Montserrat", face = "bold"),
axis.text.y=element_text(family="Montserrat", size=7),
axis.title.x = element_text(size = 8, family="Montserrat", face = "bold"),
axis.text.x=element_text(family="Montserrat", size=7),
legend.text=element_text(family="Montserrat", size=7),
legend.title=element_text(family="Montserrat", size=8, face="bold"),
legend.position = "none")+
labs(title = " Europe is the most aboundant region \nin Luxury & Upper Upscale Chains", subtitle= "Number of Luxury, Upper Upscale & Upscale Chains across the regions", x="Region", y=" Number of chians", caption="Source: https://www.kaggle.com/ployyyywa/global-hotel-chain-presence") +
scale_y_continuous()+
scale_fill_manual(values = my_colours)+
geom_label(aes(label=count),family = "Montserrat", fontface="bold", color="grey20", )
first_plot

European landscape in terms of chain types
# data on how in Europe the chains are distributed
chain_types_europe<- chain_size_pivot %>%
filter(region=="Europe") %>%
group_by( chain_scale) %>%
summarise(count=n())
my_colours2 <- c("grey70", "#043680")
is_chain<- chain_types_europe%>%
mutate(
is_chain = ifelse(chain_scale == "Luxury Chains", TRUE,
ifelse(chain_scale== "Upper Upscale Chains", TRUE, FALSE)))
# Chart of the data across all the regions
my_text <- "Upper Upscale + Luxury = 162"
ggplot(is_chain, aes(x=reorder(chain_scale,chain_scale), y=count, fill=is_chain)) +
geom_bar(stat="identity", alpha=0.8)+
theme_minimal() +
theme(panel.grid.major.y = element_line(color = "gray60", size = 0.1),
panel.background = element_rect(fill = "white", colour = "white"),
axis.line = element_line(size = 1, colour = "grey80"),
axis.ticks = element_line(size = 3,colour = "grey80"),
axis.ticks.length = unit(.20, "cm"),
plot.title = element_text(color = "#2FABE1",size=15,face="bold", family= "Montserrat"),
plot.subtitle = element_text(color = "#2FABE1", face="bold", ,size= 10,family= "Montserrat"),
plot.caption = element_text(color = "grey40", face="italic", ,size= 7,family= "Montserrat",hjust=0),
axis.title.y = element_text(size = 8, angle = 90, family="Montserrat", face = "bold"),
axis.text.y=element_text(family="Montserrat", size=7),
axis.title.x = element_text(size = 8, family="Montserrat", face = "bold"),
axis.text.x=element_text(family="Montserrat", size=7),
legend.text=element_text(family="Montserrat", size=7),
legend.title=element_text(family="Montserrat", size=8, face="bold"),
legend.position = "none")+
labs(title = "Europe seems to be abundant in the chains of our aim ", subtitle= "Number of all the types of chains in Europe", x="Hotel sector", y=" Number of chians", caption="Source: https://www.kaggle.com/ployyyywa/global-hotel-chain-presence") +
scale_y_continuous()+
scale_fill_manual(values = my_colours2)+
geom_label(aes(label=count),family = "Montserrat", fontface="bold", color="white", )+
annotate(geom= "text", x=5.5, y=110, label=my_text, family="Montserrat",size=5, color="#2FABE1", fontface="bold")

Average rating of hotels by country
#7.hotels with highest ratings (possible: combined with location)
#average rating of hotels by country
hotel3<-hotel2 %>%
distinct(hotel_name,average_score,country)%>%
select(country,average_score)
my_colours3 <- c("white", "#2FABE1")
is_france<- hotel3%>%
mutate(
is_france = ifelse(country == "France", TRUE, FALSE)) %>%
mutate(country=recode(country, "Kingdom" = "United Kingdom"))
ggplot(is_france, aes(x=average_score,y=reorder(country,average_score),fill=is_france))+
geom_violin()+
geom_boxplot(width=0.1)+
theme_minimal() +
theme(panel.grid.major.y = element_line(color = "gray60", size = 0.1),
panel.background = element_rect(fill = "white", colour = "white"),
axis.line = element_line(size = 1, colour = "grey80"),
axis.ticks = element_line(size = 3,colour = "grey80"),
axis.ticks.length = unit(.20, "cm"),
plot.title = element_text(color = "black",size=15,face="bold", family= "Montserrat"),
plot.subtitle = element_text(color = "#043680", face="bold", ,size= 10,family= "Montserrat"),
plot.caption = element_text(color = "grey40", face="italic", ,size= 7,family= "Montserrat",hjust=0),
axis.title.y = element_text(size = 8, angle = 90, family="Montserrat", face = "plain"),
axis.text.y=element_text(family="Montserrat", size=7),
axis.title.x = element_text(size = 8, family="Montserrat", face = "plain"),
axis.text.x=element_text(family="Montserrat", size=7),
legend.text=element_text(family="Montserrat", size=7),
legend.title=element_text(family="Montserrat", size=8, face="bold"),
legend.position = "none")+
labs(title = "France has the highest median rating", subtitle= "Distribution of average ratings of hotels across 6 selected countries", x="Average rating of hotels", y="Country", caption="Source: https://www.kaggle.com/ployyyywa/global-hotel-chain-presence") +
scale_fill_manual(values = my_colours3)

Hotels of excellent ratings, by country
##hotels with highest ratings
##choose all hotels > 8/8.5/9
hotel4<-hotel3%>%
filter(average_score>=9)%>%
group_by(country)%>%
summarise(count=n())%>%
mutate(is_best=ifelse(country=="Kingdom"|country=="France",1,0))
ggplot(hotel4,aes(x=reorder(country,count),y=count,fill=factor(is_best)))+
geom_col(position="dodge")+
theme_minimal() +
theme(panel.grid.major.y = element_line(color = "gray60", size = 0.1),
panel.background = element_rect(fill = "white", colour = "white"),
axis.line = element_line(size = 1, colour = "grey80"),
axis.ticks = element_line(size = 3,colour = "grey80"),
axis.ticks.length = unit(.20, "cm"),
plot.title = element_text(color = "black",size=15,face="bold", family= "Montserrat"),
plot.subtitle = element_text(color = "#043680", face="bold", ,size= 10,family= "Montserrat"),
plot.caption = element_text(color = "grey40", face="italic", ,size= 7,family= "Montserrat",hjust=0),
axis.title.y = element_text(size = 8, angle = 90, family="Montserrat", face = "plain"),
axis.text.y=element_text(family="Montserrat", size=7),
axis.title.x = element_text(size = 8, family="Montserrat", face = "plain"),
axis.text.x=element_text(family="Montserrat", size=7, angle=30, hjust=1),
legend.text=element_text(family="Montserrat", size=7),
legend.title=element_text(family="Montserrat", size=8, face="bold"),
legend.position="none")+
labs(title = "UK & France dominate hotels with >9 average rating", subtitle= "Number of >9 rating hotels in 6 selected countries", x="Country", y="No. of hotels with rating >9", caption="Source: https://www.kaggle.com/ployyyywa/global-hotel-chain-presence")+
scale_fill_manual(values=c("grey80","#2FABE1")) +
coord_flip()

Proportion of Top 5 Reviewer Nationalities per Country
maggiehoteldata <- hotel2 %>% mutate(country=recode(
country, `Kingdom`= "United Kingdom")) %>% # change Kingdom to United Kingdom to allow matching
rename(hotelcountry = country)
# UK
## get counts and proportions of reviewer nationalities
ukdata <- maggiehoteldata %>%
filter(hotelcountry == "United Kingdom") %>%
group_by(reviewer_nationality) %>%
summarise(countreviewernat = n(),
propreviewernat = countreviewernat / 261509) %>%
arrange(desc(countreviewernat)) %>%
mutate(hotelcountry = "United Kingdom") %>%
slice(1:5)
# ukdata %>%
# summarise(sumcount = sum(countreviewernat))
# 261509 total reviews
# Netherlands
## get counts and proportions of reviewer nationalities
nldata <- maggiehoteldata %>%
filter(hotelcountry == "Netherlands") %>%
group_by(reviewer_nationality) %>%
summarise(countreviewernat = n(),
propreviewernat = countreviewernat / 57119) %>%
arrange(desc(countreviewernat)) %>%
mutate(hotelcountry = "Netherlands") %>%
slice(1:5)
# nldata %>%
# summarise(sumcount = sum(countreviewernat))
# 57119 total reviews
# Austria
## get counts and proportions of reviewer nationalities
ausdata <- maggiehoteldata %>%
filter(hotelcountry == "Austria") %>%
group_by(reviewer_nationality) %>%
summarise(countreviewernat = n(),
propreviewernat = countreviewernat / 36241) %>%
arrange(desc(countreviewernat)) %>%
mutate(hotelcountry = "Austria") %>%
slice(1:5)
# ausdata %>%
# summarise(sumcount = sum(countreviewernat))
# 36241 total reviews
# Spain
## get counts and proportions of reviewer nationalities
spaindata <- maggiehoteldata %>%
filter(hotelcountry == "Spain") %>%
group_by(reviewer_nationality) %>%
summarise(countreviewernat = n(),
propreviewernat = countreviewernat / 59895) %>%
arrange(desc(countreviewernat)) %>%
mutate(hotelcountry = "Spain") %>%
slice(1:5)
# spaindata %>%
# summarise(sumcount = sum(countreviewernat))
# 59895 total reviews
# France
## get counts and proportions of reviewer nationalities
francedata <- maggiehoteldata %>%
filter(hotelcountry == "France") %>%
group_by(reviewer_nationality) %>%
summarise(countreviewernat = n(),
propreviewernat = countreviewernat / 59011 ) %>%
arrange(desc(countreviewernat)) %>%
mutate(hotelcountry = "France") %>%
slice(1:5)
# francedata %>%
# summarise(sumcount = sum(countreviewernat))
# 59011 total reviews
# Italy
## get counts and proportions of reviewer nationalities
italydata <- maggiehoteldata %>%
filter(hotelcountry == "Italy") %>%
group_by(reviewer_nationality) %>%
summarise(countreviewernat = n(),
propreviewernat = countreviewernat / 37170) %>%
arrange(desc(countreviewernat)) %>%
mutate(hotelcountry = "Italy") %>%
slice(1:5)
# italydata %>%
# summarise(sumcount = sum(countreviewernat))
# 37170 total reviews
# Combine all of the above
top5reviewernat <- rbind(ukdata,
nldata,
spaindata,
francedata,
italydata,
ausdata)
countrylevels <- names(sort(tapply(top5reviewernat$propreviewernat, # create levels to have stacked bar chart according to size not alphabetical order
top5reviewernat$reviewer_nationality,
sum)))
# Code for barplot
stackedbarplot <- ggplot(top5reviewernat, aes(x = hotelcountry, y = propreviewernat,
fill = factor(reviewer_nationality, levels = countrylevels))) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("#810f7c", "#8856a7", "#8c96c6", "#9ebcda", "#bfd3e6", "#edf8fb",
"#ccebc5", "#a8ddb5", "#7bccc4", "#43a2ca", "#0868ac", "darkblue")) +
labs(title = "Brits take over as Majority Reviewers",
subtitle = "Proportion of Top 5 Reviewer Nationalities per Country",
y = "Proportion of Total Reviews",
x = " ") +
theme(legend.title = element_blank(),
panel.grid.major.y = element_line(color = "gray60", size = 0.1),
panel.background = element_rect(fill = "white", colour = "white"),
axis.line = element_line(size = 1, colour = "grey80"),
axis.ticks = element_line(size = 3,colour = "grey80"),
plot.title = element_text(size=15,face="bold", family= "Montserrat"),
plot.subtitle = element_text(face="plain", ,size= 10,family= "Montserrat"),
plot.caption = element_text(color = "grey40", face="italic", ,size= 7,family= "Montserrat",hjust=0),
axis.title.y = element_text(size = 8, angle = 90, family="Montserrat", face = "bold"),
axis.text.y=element_text(family="Montserrat", size=7),
axis.title.x = element_text(size = 8, family="Montserrat", face = "bold"),
axis.text.x=element_text(family="Montserrat", size=7),
legend.text=element_text(family="Montserrat", size=7))
stackedbarplot

Region and subregion reviers’ background
Creating variables
glimpse(hotel2)
## Rows: 510,945
## Columns: 18
## $ hotel_address <chr> "s Gravesandestraat 55 Oos…
## $ additional_number_of_scoring <dbl> 194, 194, 194, 194, 194, 1…
## $ review_date <date> 2017-03-08, 2017-03-08, N…
## $ average_score <dbl> 7.7, 7.7, 7.7, 7.7, 7.7, 7…
## $ hotel_name <chr> "Hotel Arena", "Hotel Aren…
## $ reviewer_nationality <chr> "Russia", "Ireland", "Aust…
## $ negative_review <chr> "I am so angry that i made…
## $ review_total_negative_word_counts <dbl> 397, 0, 42, 210, 140, 17, …
## $ total_number_of_reviews <dbl> 1403, 1403, 1403, 1403, 14…
## $ positive_review <chr> "Only the park outside of …
## $ review_total_positive_word_counts <dbl> 11, 105, 21, 26, 8, 20, 18…
## $ total_number_of_reviews_reviewer_has_given <dbl> 7, 7, 9, 1, 3, 1, 6, 1, 3,…
## $ reviewer_score <dbl> 2.9, 7.5, 7.1, 3.8, 6.7, 6…
## $ tags <chr> "[' Leisure trip ', ' Coup…
## $ days_since_review <chr> "0 days", "0 days", "3 day…
## $ lat <dbl> 52.4, 52.4, 52.4, 52.4, 52…
## $ lng <dbl> 4.92, 4.92, 4.92, 4.92, 4.…
## $ country <chr> "Netherlands", "Netherland…
hotel_review_nationality<- hotel2 %>% #merging main table with countries and continents of reviwers' origin
left_join(country_continent, by=c("reviewer_nationality"="name")) %>%
drop_na(reviewer_nationality, region)
reviews_by_region<- hotel_review_nationality %>% #creating summary for regions
group_by(region) %>%
summarise(avg_review= mean(reviewer_score),
avg_no_positive= mean(review_total_positive_word_counts),
avg_no_negative= mean(review_total_negative_word_counts)) %>%
arrange(desc(avg_review))
reviews_by_subregion<- hotel_review_nationality %>% #creating summary for subregions
group_by(region,sub_region) %>%
summarise(avg_review= mean(reviewer_score),
avg_no_positive= mean(review_total_positive_word_counts),
avg_no_negative= mean(review_total_negative_word_counts)) %>%
arrange(desc(avg_review))
Reviewers’ origin by region
# pivoting regions
reviews_by_region_pivot<- reviews_by_region %>%
pivot_longer(cols = 3:4, names_to= "type_review", values_to= "score", values_drop_na = TRUE) # pivoted dataframe
reviews_by_region_pivot$type_review<- reviews_by_region_pivot$type_review %>% factor(levels= c("avg_no_negative", "avg_no_positive")) # creating avg no of positive & negative words per region + average review score per region
# creating a cool chart
reviews_by_region_pivot <- reviews_by_region_pivot%>%
mutate(score_chart = ifelse(type_review == "avg_no_positive",
score,
-1*score)) %>%
mutate(is_extreme = ifelse(score_chart>20, "1",
ifelse(score_chart< -19, "2",
ifelse(score_chart>-19 & score_chart <0, "3", "4"))))
my_colours_positive<- c("#45B05F", "tomato","#FFBAAD","#A8DDB5") # scale for the chart
my_text1r <- "Negative"
my_text2r <- "Positive"
my_text3r <- "Average review score"
# negative and positive
reviews_by_region_pivot %>%
ggplot(aes(x = reorder(region,avg_review), fill=is_extreme))+
geom_bar(aes(y = score_chart),stat = "identity")+
coord_flip() +
theme_minimal()+
geom_text(aes(y=avg_review, label=round(avg_review,1)),family = "Montserrat", fontface="bold", colour="black")+
scale_y_continuous(breaks=seq(-20,20,5)) +
theme(panel.grid.major.y = element_line(color = "gray60", size = 0.1),
panel.background = element_rect(fill = "white", colour = "white"),
axis.line = element_line(size = 1, colour = "grey80"),
axis.ticks = element_line(size = 3,colour = "grey80"),
axis.ticks.length = unit(.20, "cm"),
plot.title = element_text(color = "grey20",size=15,face="bold", family= "Montserrat"),
plot.subtitle = element_text(color = "grey40", face="plain",size= 10,family= "Montserrat"),
plot.caption = element_text(color = "grey40", face="italic",size= 7,family= "Montserrat",hjust=0),
axis.title.y = element_text(size = 8, angle = 90, family="Montserrat", face = "bold"),
axis.text.y=element_text(family="Montserrat", size=7),
axis.title.x = element_text(size = 8, family="Montserrat", face = "bold"),
axis.text.x=element_text(family="Montserrat", size=7),
legend.text=element_text(family="Montserrat", size=7),
legend.title=element_text(family="Montserrat", size=8, face="bold"),
legend.position = "none")+
labs(title = "The most criticising guests come from Europe,\nthe most positive come from Australia", subtitle= "Number of positive and negative words per review per continent and average review score", x= "Region", y= "Average number of positive/negative words in a review", caption="Source: https://www.kaggle.com/ployyyywa/global-hotel-chain-presence") +
scale_fill_manual(values = my_colours_positive) +
scale_y_continuous(labels=abs)+
geom_hline(yintercept = -18, linetype="dashed",
color = "tomato", size=0.8)+
geom_hline(yintercept = 19.5, linetype="dashed",
color = "#45B05F", size=0.8)+
annotate(geom= "text", x=5, y=-2, label=my_text1r, family="Montserrat",size=6, color="tomato", fontface="bold")+
annotate(geom= "text", x=5, y=1.9, label=my_text2r, family="Montserrat",size=6, color="#F1FFEB", fontface=2)+
annotate(geom= "text", x=5.5, y=9, label=my_text3r, family="Montserrat",size=4, color="black", fontface=2)

Reviewers’ origin by Subregion
reviews_by_subregion_pivot<- reviews_by_subregion %>%
pivot_longer(cols = 4:5, names_to= "type_review", values_to= "score", values_drop_na = TRUE) # pivoted dataframe
reviews_by_subregion_pivot$type_review<- reviews_by_subregion_pivot$type_review %>% factor(levels= c("avg_no_negative", "avg_no_positive")) # creating avg no of positive & negative words per region + average review score per subregion
reviews_by_subregion_pivot <- reviews_by_subregion_pivot%>%
mutate(score_chart = ifelse(type_review == "avg_no_positive",
score,
-1*score)) %>%
mutate(is_extreme = ifelse(score_chart>17.5, "1",
ifelse(score_chart< -17.5, "2",
ifelse(score_chart>-17.5 & score_chart <0, "3", "4"))))
my_text1s <- "Negative"
my_text2s <- "Positive"
my_text3s <- "Average \nreview score"
reviews_by_subregion_pivot %>%
ggplot(aes(x = reorder(sub_region,avg_review), fill = is_extreme))+
geom_bar(aes(y = score_chart),stat = "identity")+
coord_flip() +
geom_text(aes(y=avg_review, label=round(avg_review,1)),family = "Montserrat", fontface="bold", colour="black")+
scale_y_continuous(breaks=seq(-20,20,5)) +
theme(panel.grid.major.y = element_line(color = "gray60", size = 0.1),
panel.background = element_rect(fill = "white", colour = "white"),
axis.line = element_line(size = 1, colour = "grey80"),
axis.ticks = element_line(size = 3,colour = "grey80"),
axis.ticks.length = unit(.20, "cm"),
plot.title = element_text(color = "grey20",size=10,face="bold", family= "Montserrat"),
plot.subtitle = element_text(color = "grey40", face="plain", ,size= 8,family= "Montserrat"),
plot.caption = element_text(color = "grey40", face="italic", ,size= 7,family= "Montserrat",hjust=0),
axis.title.y = element_text(size = 8, angle = 90, family="Montserrat", face = "bold"),
axis.text.y=element_text(family="Montserrat", size=7),
axis.title.x = element_text(size = 8, family="Montserrat", face = "bold"),
axis.text.x=element_text(family="Montserrat", size=7),
legend.text=element_text(family="Montserrat", size=7),
legend.title=element_text(family="Montserrat", size=8, face="bold"),
legend.position = "none")+
labs(title = "Guests from Europe & North America seem to be \nmost influencial ones and shaping the opinion", subtitle= "Average number of positive/negative words in a review and average review score", x= "Region", y= "Average number of positive/negative words in a review", caption="Source: https://www.kaggle.com/ployyyywa/global-hotel-chain-presence") +
scale_fill_manual(values = my_colours_positive) +
scale_y_continuous(labels=abs)+
geom_hline(yintercept = -17.5, linetype="dashed",
color = "tomato", size=0.8)+
geom_hline(yintercept = 17.5, linetype="dashed",
color = "#45B05F", size=0.8)+
annotate(geom= "text", x=15, y=-2, label=my_text1s, family="Montserrat",size=4.5, color="tomato", fontface="bold")+
annotate(geom= "text", x=15, y=1.9, label=my_text2s, family="Montserrat",size=4.5, color="#F1FFEB", fontface=2)+
annotate(geom= "text", x=17, y=12.5, label=my_text3s, family="Montserrat",size=3.5, color="black", fontface=2)

Creating wordclouds with most popular positive/ negative words
Preparing dataset
hotelcloud<-hotel2[!is.na(hotel2$positive_review), ]
hotelcloud<-hotel2[!is.na(hotel2$negative_review), ]
hotelcloud2<-hotelcloud
#clean reviews and select only those with useful information
hotelcloud3<-hotelcloud2 %>%
filter(sapply(strsplit(positive_review, " "), length)>3) %>%
filter(sapply(strsplit(negative_review, " "), length)>3)
#randomly select 6000 data to visualize
hotelcloud4<-hotelcloud3[sample(nrow(hotelcloud3), 6000), ]
rm(hotelcloud3,hotelcloud2)
#build corpus
library(tm)
review_pos <- Corpus(VectorSource(hotelcloud4$positive_review))
review_neg <- Corpus(VectorSource(hotelcloud4$negative_review))
#skip the words: room and hotel, cuz they dont give useful information
words <- c("room","hotel")
review_pos <- tm_map(review_pos, removeWords, words)
review_neg <- tm_map(review_neg, removeWords, words)
#count frequency
DTM_pos <- DocumentTermMatrix(review_pos, control = list(
tolower = TRUE,
removeNumbers = TRUE,
stopwords = TRUE,
removePunctuation = TRUE,
stripWhitespace = TRUE))
dim(DTM_pos)
## [1] 6000 5660
DTM_neg <- DocumentTermMatrix(review_neg, control = list(
tolower = TRUE,
removeNumbers = TRUE,
stopwords = TRUE,
removePunctuation = TRUE,
stripWhitespace = TRUE))
inspect(DTM_neg)
## <<DocumentTermMatrix (documents: 6000, terms: 7568)>>
## Non-/sparse entries: 73268/45334732
## Sparsity : 100%
## Maximal term length: 18
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs bathroom bed bit breakfast didn little one rooms small staff
## 2515 2 1 0 0 0 0 1 0 0 0
## 2620 1 1 0 1 0 1 2 0 0 0
## 292 0 0 0 0 3 0 0 2 0 2
## 3331 0 1 0 3 0 0 1 0 0 0
## 3333 1 4 1 0 2 1 0 0 0 3
## 4033 0 0 1 0 0 0 0 0 0 0
## 4332 0 0 0 0 0 0 2 4 1 0
## 4603 3 0 0 0 2 0 2 0 0 1
## 4721 0 0 1 4 6 0 0 0 0 1
## 5519 0 0 0 1 1 1 0 0 0 2
rm(review_neg,review_pos)
Positive wordcloud
##Positive wordcloud
#convert to tibble
m <- as.matrix(DTM_pos)
DTM_tbl <- as_tibble(m)
#rm (m,hotel3,DTM_neg,DTM_pos,review_pos,review_neg,DTM_tbl,wordCountDoc_pos)
DTM_pos_tidy <- pivot_longer(DTM_tbl, cols = everything(), names_to = "word", values_to = "wordCount")
# Order by freq
wordCountDoc_pos <- DTM_pos_tidy %>%
group_by(word) %>%
summarise(total_pos = sum(wordCount)) %>%
arrange(desc(total_pos))
print(wordCountDoc_pos %>% top_n(5))
## # A tibble: 5 x 2
## word total_pos
## <chr> <dbl>
## 1 staff 2585
## 2 location 2501
## 3 good 1770
## 4 great 1418
## 5 friendly 1146
#wordcloud
library(wordcloud)
library(wordcloud2)
wordcloud(words = wordCountDoc_pos$word,
freq = wordCountDoc_pos$total_pos,
max.words = 150,
scale = c(3, 0.5),
random.order = FALSE,
rot.per = 0.35,
colors = c("#ccebc5", "#7bccc4", "#7fbf7b","#4eb3d3","#8c6bb1","#810f7c"))+
title(main = "Most Frequent Words in Positive Reviews", col.main = "#810f7c", size = 8, family="Montserrat", face = "bold")

## integer(0)
Negatvie wordcloud
##Negatvie wordcloud
#convert to tibble
m <- as.matrix(DTM_neg)
DTM_tbl <- as_tibble(m)
#rm (m,hotel3,DTM_neg,DTM_pos,review_pos,review_neg,DTM_tbl,wordCountDoc_pos)
DTM_neg_tidy <- pivot_longer(DTM_tbl, cols = everything(), names_to = "word", values_to = "wordCount")
# Order by freq
wordCountDoc_neg <- DTM_neg_tidy %>%
group_by(word) %>%
summarise(total_neg = sum(wordCount)) %>%
arrange(desc(total_neg))
print(wordCountDoc_neg %>% top_n(5))
## # A tibble: 5 x 2
## word total_neg
## <chr> <dbl>
## 1 breakfast 1077
## 2 small 870
## 3 staff 700
## 4 rooms 596
## 5 bit 578
#wordcloud
wordcloud(words = wordCountDoc_neg$word,
freq = wordCountDoc_neg$total_neg,
max.words = 150,
min.freq = 140,
scale = c(3, 0.5),
random.order = FALSE,
rot.per = 0.35,
colors = c("#ccebc5", "#7bccc4", "#7fbf7b","#4eb3d3","#8c6bb1","#810f7c"))+
title(main = "Most Frequent Words in Negative Reviews", col.main = "#810f7c", size = 8, family="Montserrat", face = "bold")

## integer(0)
Hotel reviewed by highest number of alleged bots
bots_by_hotel<-duplicates%>%
group_by(hotel_name)%>%
summarise(count=n())%>%
mutate(is_culprit=ifelse(hotel_name=="H tel Concorde Montparnasse",1,0))%>% #for graph fill aesthetic
arrange(desc(count))%>%
slice(1:7) #selecting top 7 hotels for visualisation, so we arrange by desc(count) and slice the first 7
ggplot(bots_by_hotel, aes(x=reorder(hotel_name,count), y=count, fill=factor(is_culprit))) +
geom_bar(stat="identity", alpha=0.8)+
coord_flip()+ #flip axes to show name of hotel
theme_minimal() + #put simple theme to emnphasise the graphs
theme(panel.grid.major.y = element_line(color = "gray60", size = 0.1),
panel.background = element_rect(fill = "white", colour = "white"),
axis.line = element_line(size = 1, colour = "grey80"),
axis.ticks = element_line(size = 3,colour = "grey80"),
axis.ticks.length = unit(.20, "cm"),
plot.title = element_text(color = "#043680",size=15,face="bold", family= "Montserrat"),
plot.subtitle = element_text(color = "#043680", face="plain", ,size= 10,family= "Montserrat"),
plot.caption = element_text(color = "grey40", face="italic", ,size= 7,family= "Montserrat",hjust=0),
axis.title.y = element_text(size = 8, angle = 90, family="Montserrat", face = "plain"),
axis.text.y=element_text(family="Montserrat", size=7),
axis.title.x = element_text(size = 8, family="Montserrat", face = "plain"),
axis.text.x=element_text(family="Montserrat", size=7),
legend.text=element_text(family="Montserrat", size=7),
legend.title=element_text(family="Montserrat", size=8, face="bold"),
legend.position = "none")+
labs(title = "Hotels reviewed by \nhighest number of alleged bots", subtitle= "", x="Hotels with alleged use of bots", y="Count", caption="Source: https://www.kaggle.com/ployyyywa/global-hotel-chain-presence") +
scale_y_continuous()+
scale_fill_manual(values = c("grey80", "#2FABE1"))+ #specifying the colours
geom_label(aes(label=count))

Meaningless reviews vs meaningful reviews
#popularity of bots
bots_by_popularity<-duplicates%>%
mutate(negative_review_renew=ifelse(negative_review=="No Negative"|negative_review=="Nothing"|negative_review=="nothing"|negative_review=="Everything","Meaningless review","Meaningful review"))%>% #define meaningless review and meaningful review for visualisation
group_by(negative_review_renew)%>%
summarise(sum=n())%>%
arrange(desc(sum))%>%
mutate(is_others=ifelse(negative_review_renew=="Meaningful review",1,0))%>% #deprecated line, originally used for fill aesthetic colouring, but we cancelled the colouring already
slice(1:2)
my_text1b <- "No Negative, \nNothing, \nnothing, & \nEverything"
ggplot(bots_by_popularity, aes(x=reorder(negative_review_renew,-sum), y=sum)) +
geom_bar(stat="identity", alpha=0.8)+
theme_minimal() +
theme(panel.grid.major.y = element_line(color = "gray60", size = 0.1),
panel.background = element_rect(fill = "white", colour = "white"),
axis.line = element_line(size = 1, colour = "grey80"),
axis.ticks = element_line(size = 3,colour = "grey80"),
axis.ticks.length = unit(.20, "cm"),
plot.title = element_text(color = "#043680",size=15,face="bold", family= "Montserrat"),
plot.subtitle = element_text(color = "#043680", face="plain", ,size= 10,family= "Montserrat"),
plot.caption = element_text(color = "grey40", face="italic", ,size= 7,family= "Montserrat",hjust=0),
axis.title.y = element_text(size = 8, angle = 90, family="Montserrat", face = "plain"),
axis.text.y=element_text(family="Montserrat", size=7),
axis.title.x = element_text(size = 8, family="Montserrat", face = "plain"),
axis.text.x=element_text(family="Montserrat", size=7),
legend.text=element_text(family="Montserrat", size=7),
legend.title=element_text(family="Montserrat", size=8, face="bold"),
legend.position = "none")+
labs(title = "Two-thirds of bots produce meaningless reviews", subtitle= "", x="Type of review", y="Count", caption="(We define repetitive reviews as exactly one of 'No Negative', 'Nothing', 'nothing' and 'Everything')") +
scale_y_continuous()+
scale_fill_manual(values = c("grey80"))+
geom_label(aes(label=sum),family="Montserrat", fontface="bold" )#+

#annotate(geom= "text", x=1.7, y=1500, label=my_text1b, family="Montserrat",size=7, color="#2FABE1",fontface="bold")
Thank you!