Beer Ratings


Load libraries

library(descr)
library(plyr)

Processing

Read the data in and save the raw data as an RDA file. Create a smaller sample.

# Remember the current working directory; the relative "../../data" paths
# below are resolved against it.
wd <- getwd()
# beerReviews <- read.csv('../../data/beer_reviews.csv')
# Brewery lookup table (brewery id -> country code).
# NOTE(review): with read.csv's default na.strings = "NA", a country code
# spelled "NA" (e.g. Namibia's ISO code) would be read as a missing value --
# confirm whether the NA's reported for `country` below are real gaps.
breweries <- read.csv("../../data/breweries.csv")
# save(beerReviews,file='../../data/beer_reviews.rda')
# Keep a binary (RDA) copy of the brewery table next to the CSV.
save(breweries, file = "../../data/breweries.rda")

Current working directory is “/home/jakub/R/beer/code/rewcode”.

Subset review dataset

# Disabled one-off sampling step, kept for reference (presumably how
# reviews_subset.csv was produced -- confirm):
# set.seed(1)
# subset <- beerReviews[sample(1:nrow(beerReviews), size = 200000), ]
# save(subset, file = '../../data/reviews_subset.rda')
# write.csv(subset, file = '../../data/reviews_subset.csv')
#
# Load the sampled reviews. The extra column `X` seen below looks like the row
# names that write.csv stores by default -- TODO confirm.
# Note: the name `subset` shadows base::subset(); calls such as subset(data, ...)
# still reach the function because R skips non-function bindings when it looks
# up a call.
subset <- read.csv("../../data/reviews_subset.csv")

Look at the data set

Reviews

dim(subset)
## [1] 200000     14
head(subset)
##         X brewery_id                       brewery_name review_time
## 1  421260         73        Great Lakes Brewing Company  1313022603
## 2  590417       1549              Goose Island Beer Co.  1274563790
## 3  908897        142           Spaten-Franziskaner-Bräu  1149727060
## 4 1440973         63                  Deschutes Brewery  1237525967
## 5  319991         35 Boston Beer Company (Samuel Adams)  1274749248
## 6 1425394        385           Midnight Sun Brewing Co.  1301966988
##   review_overall review_aroma review_appearance review_profilename
## 1            4.0          3.5               4.5             Fuzzy1
## 2            4.5          4.5               5.0         cpetrone84
## 3            4.0          3.0               4.0           jmz62565
## 4            4.0          4.5               3.0            sevopie
## 5            4.5          4.0               4.5             klewis
## 6            4.5          4.5               4.0             glid02
##                         beer_style review_palate review_taste
## 1          American Pale Ale (APA)           4.0          4.0
## 2 American Double / Imperial Stout           4.5          4.5
## 3                       Doppelbock           4.0          3.5
## 4                  American Porter           3.5          3.5
## 5              American Barleywine           4.5          4.0
## 6               English Barleywine           4.5          5.0
##                            beer_name beer_abv beer_beerid
## 1 Great Lakes Burning River Pale Ale      6.0         225
## 2  Bourbon County Brand Coffee Stout     13.0       57747
## 3                   Spaten Optimator      7.6          99
## 4                 Black Butte Porter      5.2         198
## 5 LongShot Mile High Barley Wine Ale      9.8       57272
## 6           Arctic Devil Barley Wine     13.2       18093
summary(subset)
##        X             brewery_id   
##  Min.   :      4   Min.   :    1  
##  1st Qu.: 396209   1st Qu.:  143  
##  Median : 794920   Median :  423  
##  Mean   : 793427   Mean   : 3115  
##  3rd Qu.:1190326   3rd Qu.: 2372  
##  Max.   :1586610   Max.   :28003  
##                                   
##                              brewery_name     review_time      
##  Boston Beer Company (Samuel Adams):  4883   Min.   :8.85e+08  
##  Stone Brewing Co.                 :  4287   1st Qu.:1.17e+09  
##  Dogfish Head Brewery              :  4241   Median :1.24e+09  
##  Sierra Nevada Brewing Co.         :  3596   Mean   :1.22e+09  
##  Bell's Brewery, Inc.              :  3194   3rd Qu.:1.29e+09  
##  Rogue Ales                        :  2938   Max.   :1.33e+09  
##  (Other)                           :176861                     
##  review_overall  review_aroma  review_appearance      review_profilename
##  Min.   :0.00   Min.   :1.00   Min.   :0.00      northyorksammy:   716  
##  1st Qu.:3.50   1st Qu.:3.50   1st Qu.:3.50      mikesgroove   :   586  
##  Median :4.00   Median :4.00   Median :4.00      BuckeyeNation :   546  
##  Mean   :3.82   Mean   :3.74   Mean   :3.84      NeroFiddled   :   441  
##  3rd Qu.:4.50   3rd Qu.:4.00   3rd Qu.:4.00      womencantsail :   428  
##  Max.   :5.00   Max.   :5.00   Max.   :5.00      Thorpe429     :   425  
##                                                  (Other)       :196858  
##                             beer_style     review_palate   review_taste 
##  American IPA                    : 14726   Min.   :1.00   Min.   :1.00  
##  American Double / Imperial IPA  : 10903   1st Qu.:3.50   1st Qu.:3.50  
##  American Pale Ale (APA)         :  8198   Median :4.00   Median :4.00  
##  Russian Imperial Stout          :  6786   Mean   :3.75   Mean   :3.79  
##  American Porter                 :  6384   3rd Qu.:4.00   3rd Qu.:4.50  
##  American Double / Imperial Stout:  6356   Max.   :5.00   Max.   :5.00  
##  (Other)                         :146647                                
##                                beer_name         beer_abv      
##  90 Minute IPA                      :   413   Min.   :   0.05  
##  Stone Ruination IPA                :   404   1st Qu.:   5.20  
##  India Pale Ale                     :   387   Median :   6.50  
##  Old Rasputin Russian Imperial Stout:   387   Mean   :   7.05  
##  Sierra Nevada Celebration Ale      :   366   3rd Qu.:   8.50  
##  Sierra Nevada Pale Ale             :   349   Max.   :  41.00  
##  (Other)                            :197694   NA's   :8496.00  
##   beer_beerid   
##  Min.   :    3  
##  1st Qu.: 1717  
##  Median :13906  
##  Mean   :21714  
##  3rd Qu.:39441  
##  Max.   :77317  
## 
sapply(subset[1, ], class)
##                  X         brewery_id       brewery_name 
##          "integer"          "integer"           "factor" 
##        review_time     review_overall       review_aroma 
##          "integer"          "numeric"          "numeric" 
##  review_appearance review_profilename         beer_style 
##          "numeric"           "factor"           "factor" 
##      review_palate       review_taste          beer_name 
##          "numeric"          "numeric"           "factor" 
##           beer_abv        beer_beerid 
##          "numeric"          "integer"

Breweries

dim(breweries)
## [1] 4567    2
head(breweries)
##      id country
## 1 18115      RO
## 2  2186      DE
## 3 16792      US
## 4 11618      US
## 5 10541      MX
## 6 15756      DE
summary(breweries)
##        id           country    
##  Min.   :    1   US     :1750  
##  1st Qu.: 2552   DE     : 603  
##  Median : 8590   UK     : 284  
##  Mean   :10388   CA     : 198  
##  3rd Qu.:17092   AU     : 183  
##  Max.   :28003   (Other):1547  
##                  NA's   :   2
sapply(breweries[1, ], class)
##        id   country 
## "integer"  "factor"

Merge datasets


# Attach each review's brewery country by joining on the brewery id.
# NOTE(review): all = TRUE is a full outer join, so breweries without any
# review in the sample are kept as rows whose review columns are all NA (the
# row count below grows from 200000 to 200959). Those rows are also counted in
# the per-country frequencies. If only reviews should remain, all.x = TRUE is
# the left join -- confirm intent before changing.
data <- merge(subset, breweries, by.x = "brewery_id", by.y = "id", all = TRUE)

Look at the merged data

dim(data)
## [1] 200959     15
head(data)
##   brewery_id     X             brewery_name review_time review_overall
## 1          1 46773 Plzensky Prazdroj, a. s.  1111822280            2.5
## 2          1 46100 Plzensky Prazdroj, a. s.  1246908992            3.0
## 3          1 46422 Plzensky Prazdroj, a. s.  1191757110            3.5
## 4          1 46728 Plzensky Prazdroj, a. s.  1125889212            3.0
## 5          1 46068 Plzensky Prazdroj, a. s.  1252816687            5.0
## 6          1 46263 Plzensky Prazdroj, a. s.  1218368958            4.5
##   review_aroma review_appearance review_profilename     beer_style
## 1          2.0               4.0           hefevice Czech Pilsener
## 2          3.5               3.5    civilizedpsycho Czech Pilsener
## 3          3.5               3.0             armock Czech Pilsener
## 4          2.0               3.0             BigBry Czech Pilsener
## 5          4.0               4.0             jsh420 Czech Pilsener
## 6          4.0               4.5       TurdFurgison Czech Pilsener
##   review_palate review_taste       beer_name beer_abv beer_beerid country
## 1           2.5          2.5 Pilsner Urquell      4.4         429      CZ
## 2           2.5          3.0 Pilsner Urquell      4.4         429      CZ
## 3           3.5          3.5 Pilsner Urquell      4.4         429      CZ
## 4           3.0          3.5 Pilsner Urquell      4.4         429      CZ
## 5           4.0          4.0 Pilsner Urquell      4.4         429      CZ
## 6           4.0          4.0 Pilsner Urquell      4.4         429      CZ
summary(data)
##    brewery_id          X          
##  Min.   :    1   Min.   :      4  
##  1st Qu.:  144   1st Qu.: 396209  
##  Median :  429   Median : 794920  
##  Mean   : 3169   Mean   : 793427  
##  3rd Qu.: 2431   3rd Qu.:1190326  
##  Max.   :28003   Max.   :1586610  
##                  NA's   :    959  
##                              brewery_name     review_time      
##  Boston Beer Company (Samuel Adams):  4883   Min.   :8.85e+08  
##  Stone Brewing Co.                 :  4287   1st Qu.:1.17e+09  
##  Dogfish Head Brewery              :  4241   Median :1.24e+09  
##  Sierra Nevada Brewing Co.         :  3596   Mean   :1.22e+09  
##  Bell's Brewery, Inc.              :  3194   3rd Qu.:1.29e+09  
##  (Other)                           :179799   Max.   :1.33e+09  
##  NA's                              :   959   NA's   :9.59e+02  
##  review_overall    review_aroma    review_appearance
##  Min.   :  0.00   Min.   :  1.00   Min.   :  0.00   
##  1st Qu.:  3.50   1st Qu.:  3.50   1st Qu.:  3.50   
##  Median :  4.00   Median :  4.00   Median :  4.00   
##  Mean   :  3.82   Mean   :  3.74   Mean   :  3.84   
##  3rd Qu.:  4.50   3rd Qu.:  4.00   3rd Qu.:  4.00   
##  Max.   :  5.00   Max.   :  5.00   Max.   :  5.00   
##  NA's   :959.00   NA's   :959.00   NA's   :959.00   
##       review_profilename                          beer_style    
##  northyorksammy:   716   American IPA                  : 14726  
##  mikesgroove   :   586   American Double / Imperial IPA: 10903  
##  BuckeyeNation :   546   American Pale Ale (APA)       :  8198  
##  NeroFiddled   :   441   Russian Imperial Stout        :  6786  
##  womencantsail :   428   American Porter               :  6384  
##  (Other)       :197283   (Other)                       :153003  
##  NA's          :   959   NA's                          :   959  
##  review_palate     review_taste   
##  Min.   :  1.00   Min.   :  1.00  
##  1st Qu.:  3.50   1st Qu.:  3.50  
##  Median :  4.00   Median :  4.00  
##  Mean   :  3.75   Mean   :  3.79  
##  3rd Qu.:  4.00   3rd Qu.:  4.50  
##  Max.   :  5.00   Max.   :  5.00  
##  NA's   :959.00   NA's   :959.00  
##                                beer_name         beer_abv      
##  90 Minute IPA                      :   413   Min.   :   0.05  
##  Stone Ruination IPA                :   404   1st Qu.:   5.20  
##  India Pale Ale                     :   387   Median :   6.50  
##  Old Rasputin Russian Imperial Stout:   387   Mean   :   7.05  
##  Sierra Nevada Celebration Ale      :   366   3rd Qu.:   8.50  
##  (Other)                            :198043   Max.   :  41.00  
##  NA's                               :   959   NA's   :9455.00  
##   beer_beerid       country      
##  Min.   :    3   US     :135219  
##  1st Qu.: 1717   BE     : 15026  
##  Median :13906   DE     : 12653  
##  Mean   :21714   UK     :  9054  
##  3rd Qu.:39441   CA     :  3970  
##  Max.   :77317   (Other): 21279  
##  NA's   :  959   NA's   :  3758
sapply(data[1, ], class)
##         brewery_id                  X       brewery_name 
##          "integer"          "integer"           "factor" 
##        review_time     review_overall       review_aroma 
##          "integer"          "numeric"          "numeric" 
##  review_appearance review_profilename         beer_style 
##          "numeric"           "factor"           "factor" 
##      review_palate       review_taste          beer_name 
##          "numeric"          "numeric"           "factor" 
##           beer_abv        beer_beerid            country 
##          "numeric"          "integer"           "factor"

Find out about missing values

# Total number of NA cells over all columns (one row can contribute several),
# not the number of incomplete rows.
sum(is.na(data))
## [1] 24721

Exploratory analysis

Analyze data

table(data$country)
## 
##     AL     AM     AR     AT     AU     AW     AZ     BA     BB     BE 
##      5     13     74    531   1352      8      1      9      9  15026 
##     BG     BM     BO     BR     BS     BT     BY     BZ     CA     CH 
##     28      4     10    243     25      1      5     10   3970    179 
##     CI     CL     CN     CO     CR     CU     CV     CY     CZ     DE 
##      1      8    153    549     26      7      1     10    768  12653 
##     DK     DM     DO     DZ     EC     EE     EG     ES     ET     FI 
##   1600      2     31      1      7     35      3    170     22    128 
##     FO     FR     GA     GE     GG     GH     GL     GP     GR     GT 
##      1    820      8      7      1      3      2      1     69     27 
##     GU     HK     HN     HR     HT     HU     ID     IE     IL     IN 
##      1      2      8     32      6     21      6   1369   1818    131 
##     IS     IT     JM     JP     KE     KH     KR     KY     KZ     LA 
##     25    642    165    880     40      2     34      7     12     10 
##     LB     LC     LK     LT     LU     LV     MA     MC     MD     ME 
##     21      7    127    129     11     37   1129      1      2      7 
##     MG     MI     MK     MM     MN     MO     MQ     MT     MU     MX 
##      1      3      2      3      7      3      1     16      1    852 
##     MY     NG     NI     NL     NO     NP     NY     NZ     OR     PA 
##      1      6     15   1290    568      1      1    322      5      7 
##     PE     PF     PG     PH     PK     PL     PR     PS     PT     QC 
##     32      8      1     71      1    461      8      1     42   2434 
##     RO     RS     RU     SE     SG     SI     SK     SM     SN     SV 
##     26      7    431    409     63     15     37      1      1     48 
##     TC     TH     TM     TN     TR     TT     TW     TZ     UA     UG 
##      1     99      1      1     43     41      9      2     88      1 
##     UK     US     UY     UZ     VE     VI     VN     VU     WA     WS 
##   9054 135219      5      3      7     23     36      1     33      1 
##     ZA     ZW 
##     30      2
freq(data$country, cex.names = 0.35)

plot of chunk unnamed-chunk-6

## data$country 
##       Frequency   Percent Valid Percent
## AL            5 2.488e-03     2.535e-03
## AM           13 6.469e-03     6.592e-03
## AR           74 3.682e-02     3.753e-02
## AT          531 2.642e-01     2.693e-01
## AU         1352 6.728e-01     6.856e-01
## AW            8 3.981e-03     4.057e-03
## AZ            1 4.976e-04     5.071e-04
## BA            9 4.479e-03     4.564e-03
## BB            9 4.479e-03     4.564e-03
## BE        15026 7.477e+00     7.620e+00
## BG           28 1.393e-02     1.420e-02
## BM            4 1.990e-03     2.028e-03
## BO           10 4.976e-03     5.071e-03
## BR          243 1.209e-01     1.232e-01
## BS           25 1.244e-02     1.268e-02
## BT            1 4.976e-04     5.071e-04
## BY            5 2.488e-03     2.535e-03
## BZ           10 4.976e-03     5.071e-03
## CA         3970 1.976e+00     2.013e+00
## CH          179 8.907e-02     9.077e-02
## CI            1 4.976e-04     5.071e-04
## CL            8 3.981e-03     4.057e-03
## CN          153 7.613e-02     7.759e-02
## CO          549 2.732e-01     2.784e-01
## CR           26 1.294e-02     1.318e-02
## CU            7 3.483e-03     3.550e-03
## CV            1 4.976e-04     5.071e-04
## CY           10 4.976e-03     5.071e-03
## CZ          768 3.822e-01     3.895e-01
## DE        12653 6.296e+00     6.416e+00
## DK         1600 7.962e-01     8.114e-01
## DM            2 9.952e-04     1.014e-03
## DO           31 1.543e-02     1.572e-02
## DZ            1 4.976e-04     5.071e-04
## EC            7 3.483e-03     3.550e-03
## EE           35 1.742e-02     1.775e-02
## EG            3 1.493e-03     1.521e-03
## ES          170 8.459e-02     8.621e-02
## ET           22 1.095e-02     1.116e-02
## FI          128 6.369e-02     6.491e-02
## FO            1 4.976e-04     5.071e-04
## FR          820 4.080e-01     4.158e-01
## GA            8 3.981e-03     4.057e-03
## GE            7 3.483e-03     3.550e-03
## GG            1 4.976e-04     5.071e-04
## GH            3 1.493e-03     1.521e-03
## GL            2 9.952e-04     1.014e-03
## GP            1 4.976e-04     5.071e-04
## GR           69 3.434e-02     3.499e-02
## GT           27 1.344e-02     1.369e-02
## GU            1 4.976e-04     5.071e-04
## HK            2 9.952e-04     1.014e-03
## HN            8 3.981e-03     4.057e-03
## HR           32 1.592e-02     1.623e-02
## HT            6 2.986e-03     3.043e-03
## HU           21 1.045e-02     1.065e-02
## ID            6 2.986e-03     3.043e-03
## IE         1369 6.812e-01     6.942e-01
## IL         1818 9.047e-01     9.219e-01
## IN          131 6.519e-02     6.643e-02
## IS           25 1.244e-02     1.268e-02
## IT          642 3.195e-01     3.256e-01
## JM          165 8.211e-02     8.367e-02
## JP          880 4.379e-01     4.462e-01
## KE           40 1.990e-02     2.028e-02
## KH            2 9.952e-04     1.014e-03
## KR           34 1.692e-02     1.724e-02
## KY            7 3.483e-03     3.550e-03
## KZ           12 5.971e-03     6.085e-03
## LA           10 4.976e-03     5.071e-03
## LB           21 1.045e-02     1.065e-02
## LC            7 3.483e-03     3.550e-03
## LK          127 6.320e-02     6.440e-02
## LT          129 6.419e-02     6.542e-02
## LU           11 5.474e-03     5.578e-03
## LV           37 1.841e-02     1.876e-02
## MA         1129 5.618e-01     5.725e-01
## MC            1 4.976e-04     5.071e-04
## MD            2 9.952e-04     1.014e-03
## ME            7 3.483e-03     3.550e-03
## MG            1 4.976e-04     5.071e-04
## MI            3 1.493e-03     1.521e-03
## MK            2 9.952e-04     1.014e-03
## MM            3 1.493e-03     1.521e-03
## MN            7 3.483e-03     3.550e-03
## MO            3 1.493e-03     1.521e-03
## MQ            1 4.976e-04     5.071e-04
## MT           16 7.962e-03     8.114e-03
## MU            1 4.976e-04     5.071e-04
## MX          852 4.240e-01     4.320e-01
## MY            1 4.976e-04     5.071e-04
## NG            6 2.986e-03     3.043e-03
## NI           15 7.464e-03     7.606e-03
## NL         1290 6.419e-01     6.542e-01
## NO          568 2.826e-01     2.880e-01
## NP            1 4.976e-04     5.071e-04
## NY            1 4.976e-04     5.071e-04
## NZ          322 1.602e-01     1.633e-01
## OR            5 2.488e-03     2.535e-03
## PA            7 3.483e-03     3.550e-03
## PE           32 1.592e-02     1.623e-02
## PF            8 3.981e-03     4.057e-03
## PG            1 4.976e-04     5.071e-04
## PH           71 3.533e-02     3.600e-02
## PK            1 4.976e-04     5.071e-04
## PL          461 2.294e-01     2.338e-01
## PR            8 3.981e-03     4.057e-03
## PS            1 4.976e-04     5.071e-04
## PT           42 2.090e-02     2.130e-02
## QC         2434 1.211e+00     1.234e+00
## RO           26 1.294e-02     1.318e-02
## RS            7 3.483e-03     3.550e-03
## RU          431 2.145e-01     2.186e-01
## SE          409 2.035e-01     2.074e-01
## SG           63 3.135e-02     3.195e-02
## SI           15 7.464e-03     7.606e-03
## SK           37 1.841e-02     1.876e-02
## SM            1 4.976e-04     5.071e-04
## SN            1 4.976e-04     5.071e-04
## SV           48 2.389e-02     2.434e-02
## TC            1 4.976e-04     5.071e-04
## TH           99 4.926e-02     5.020e-02
## TM            1 4.976e-04     5.071e-04
## TN            1 4.976e-04     5.071e-04
## TR           43 2.140e-02     2.181e-02
## TT           41 2.040e-02     2.079e-02
## TW            9 4.479e-03     4.564e-03
## TZ            2 9.952e-04     1.014e-03
## UA           88 4.379e-02     4.462e-02
## UG            1 4.976e-04     5.071e-04
## UK         9054 4.505e+00     4.591e+00
## US       135219 6.729e+01     6.857e+01
## UY            5 2.488e-03     2.535e-03
## UZ            3 1.493e-03     1.521e-03
## VE            7 3.483e-03     3.550e-03
## VI           23 1.145e-02     1.166e-02
## VN           36 1.791e-02     1.826e-02
## VU            1 4.976e-04     5.071e-04
## WA           33 1.642e-02     1.673e-02
## WS            1 4.976e-04     5.071e-04
## ZA           30 1.493e-02     1.521e-02
## ZW            2 9.952e-04     1.014e-03
## NA's       3758 1.870e+00              
## Total    200959 1.000e+02     1.000e+02

Remove countries with too few ratings

# Number of merged rows per country code; table() leaves out rows whose
# country is NA.
# NOTE(review): codes such as QC, NY, OR and WA are not ISO country codes and
# look like province/state abbreviations, so codes like IL, MA or CO may mix
# states with real countries -- verify breweries.csv before interpreting.
country.freq <- table(data$country)
# Print the counts from the most to the least frequent code.
sort(country.freq, decreasing = TRUE)
## 
##     US     BE     DE     UK     CA     QC     IL     DK     IE     AU 
## 135219  15026  12653   9054   3970   2434   1818   1600   1369   1352 
##     NL     MA     JP     MX     FR     CZ     IT     NO     CO     AT 
##   1290   1129    880    852    820    768    642    568    549    531 
##     PL     RU     SE     NZ     BR     CH     ES     JM     CN     IN 
##    461    431    409    322    243    179    170    165    153    131 
##     LT     FI     LK     TH     UA     AR     PH     GR     SG     SV 
##    129    128    127     99     88     74     71     69     63     48 
##     TR     PT     TT     KE     LV     SK     VN     EE     KR     WA 
##     43     42     41     40     37     37     36     35     34     33 
##     HR     PE     DO     ZA     BG     GT     CR     RO     BS     IS 
##     32     32     31     30     28     27     26     26     25     25 
##     VI     ET     HU     LB     MT     NI     SI     AM     KZ     LU 
##     23     22     21     21     16     15     15     13     12     11 
##     BO     BZ     CY     LA     BA     BB     TW     AW     CL     GA 
##     10     10     10     10      9      9      9      8      8      8 
##     HN     PF     PR     CU     EC     GE     KY     LC     ME     MN 
##      8      8      8      7      7      7      7      7      7      7 
##     PA     RS     VE     HT     ID     NG     AL     BY     OR     UY 
##      7      7      7      6      6      6      5      5      5      5 
##     BM     EG     GH     MI     MM     MO     UZ     DM     GL     HK 
##      4      3      3      3      3      3      3      2      2      2 
##     KH     MD     MK     TZ     ZW     AZ     BT     CI     CV     DZ 
##      2      2      2      2      2      1      1      1      1      1 
##     FO     GG     GP     GU     MC     MG     MQ     MU     MY     NP 
##      1      1      1      1      1      1      1      1      1      1 
##     NY     PG     PK     PS     SM     SN     TC     TM     TN     UG 
##      1      1      1      1      1      1      1      1      1      1 
##     VU     WS 
##      1      1
length(country.freq)  # number of all countries
## [1] 142
# Threshold: a country is kept only if it has strictly more rows than this.
country.freq.border <- 100

sum(country.freq > country.freq.border)  # number of countries with more than country.freq.border (100) ratings
## [1] 33
# Codes of the countries that pass the threshold.
frequent.countries <- names(country.freq)[country.freq > country.freq.border]
nrow(data)  # number of all rows (ratings) before filtering
## [1] 200959
# Keep only rows from frequent countries; %in% is FALSE for NA, so rows
# without a country are dropped as well.
frequent.countries.data <- subset(data, data$country %in% frequent.countries)
# Rebuild the factor so that the removed countries no longer appear as empty levels.
frequent.countries.data$country <- factor(frequent.countries.data$country, levels = frequent.countries)
nrow(frequent.countries.data)  # number of ratings from countries that have enough ratings
## [1] 195572
freq(frequent.countries.data$country, cex.names = 0.4)

plot of chunk unnamed-chunk-8

## frequent.countries.data$country 
##       Frequency   Percent
## AT          531   0.27151
## AU         1352   0.69131
## BE        15026   7.68310
## BR          243   0.12425
## CA         3970   2.02994
## CH          179   0.09153
## CN          153   0.07823
## CO          549   0.28072
## CZ          768   0.39269
## DE        12653   6.46974
## DK         1600   0.81811
## ES          170   0.08692
## FI          128   0.06545
## FR          820   0.41928
## IE         1369   0.70000
## IL         1818   0.92958
## IN          131   0.06698
## IT          642   0.32827
## JM          165   0.08437
## JP          880   0.44996
## LK          127   0.06494
## LT          129   0.06596
## MA         1129   0.57728
## MX          852   0.43565
## NL         1290   0.65960
## NO          568   0.29043
## NZ          322   0.16465
## PL          461   0.23572
## QC         2434   1.24455
## RU          431   0.22038
## SE          409   0.20913
## UK         9054   4.62950
## US       135219  69.14027
## Total    195572 100.00000

Make some boxplots

# One box per country for the overall rating; cex.axis = 0.3 shrinks the axis
# annotation (presumably so that all country codes fit on the axis).
boxplot(review_overall ~ country, data = frequent.countries.data, cex.axis = 0.3)
# Mean overall rating per country, in factor-level order, i.e. the same
# left-to-right order as the boxes.
means <- with(frequent.countries.data, tapply(review_overall, country, mean, na.rm = TRUE))
# Overlay the means as red filled diamonds (pch = 18) on the boxes.
points(means, col = "red", pch = 18)

plot of chunk unnamed-chunk-9

sort(means, decreasing = TRUE)
##    IL    QC    BE    DE    FI    LK    US    NO    UK    CZ    MA    IE 
## 4.004 3.995 3.975 3.893 3.868 3.846 3.841 3.835 3.816 3.726 3.709 3.687 
##    DK    CH    SE    AT    CA    NZ    CO    PL    IT    FR    JP    NL 
## 3.684 3.678 3.652 3.631 3.569 3.563 3.556 3.509 3.500 3.497 3.469 3.465 
##    BR    AU    JM    LT    RU    ES    MX    CN    IN 
## 3.412 3.359 3.339 3.176 3.159 3.139 3.068 2.993 2.723

Group data by beer style


# Collapse the filtered reviews to one row per beer style by taking the mean of
# every column (NAs ignored); the style ends up in column Group.1.
# NOTE(review): FUN = mean is applied to ALL columns. The non-numeric ones
# (brewery_name, review_profilename, beer_style, beer_name, country) come back
# as NA in the output below, and the means of id-like columns (brewery_id, X,
# review_time, beer_beerid) carry no meaning. Consider aggregating only the
# rating columns and beer_abv.
aggdata <- aggregate(frequent.countries.data, by = list(frequent.countries.data$beer_style), 
    FUN = mean, na.rm = TRUE)
# Show the six styles with the highest mean ABV (rows ordered by descending beer_abv).
head(aggdata[with(aggdata, order(-beer_abv)), ])
##                              Group.1 brewery_id      X brewery_name
## 42                           Eisbock       1465 972332           NA
## 43                English Barleywine       1857 873188           NA
## 102                        Wheatwine       2232 744630           NA
## 5                American Barleywine       2001 725890           NA
## 12  American Double / Imperial Stout       4819 595031           NA
## 29   Bière de Champagne / Bière Brut       1240 597545           NA
##     review_time review_overall review_aroma review_appearance
## 42    1.202e+09          3.952        4.189             3.948
## 43    1.231e+09          3.838        3.966             3.906
## 102   1.250e+09          3.822        3.987             3.916
## 5     1.226e+09          3.904        4.023             4.039
## 12    1.260e+09          4.032        4.163             4.167
## 29    1.251e+09          3.577        3.679             3.971
##     review_profilename beer_style review_palate review_taste beer_name
## 42                  NA         NA         4.083        4.208        NA
## 43                  NA         NA         3.914        3.979        NA
## 102                 NA         NA         3.954        3.987        NA
## 5                   NA         NA         4.000        4.057        NA
## 12                  NA         NA         4.101        4.188        NA
## 29                  NA         NA         3.697        3.602        NA
##     beer_abv beer_beerid country
## 42     11.32       12778      NA
## 43     11.02       23575      NA
## 102    10.73       40275      NA
## 5      10.70       23027      NA
## 12     10.63       34765      NA
## 29     10.49       37928      NA

Is there a relationship between alcohol level and rating?

# Scatter plot of style-level means: mean overall rating against mean ABV, one
# point per beer style (aggdata holds one row per style).
# The formula is given with `data = aggdata` rather than `aggdata$...` terms so
# the axis labels read "beer_abv" / "review_overall" and the call matches the
# lm() formula below.
plot(review_overall ~ beer_abv, data = aggdata, col = "blue")
# Simple linear regression on the style means. It is unweighted: every style
# counts equally, regardless of how many reviews went into its mean.
lmfit <- lm(review_overall ~ beer_abv, aggdata)
# Draw the fitted regression line over the scatter plot.
abline(lmfit)

plot of chunk unnamed-chunk-11


summary(lmfit)
## 
## Call:
## lm(formula = review_overall ~ beer_abv, data = aggdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0834 -0.0600  0.0753  0.1727  0.4080 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.3629     0.0986   34.09  < 2e-16 ***
## beer_abv      0.0579     0.0146    3.96  0.00014 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.289 on 102 degrees of freedom
## Multiple R-squared: 0.133,   Adjusted R-squared: 0.125 
## F-statistic: 15.7 on 1 and 102 DF,  p-value: 0.000139

# Switch to a 2 x 2 panel layout so the default lm diagnostic plots share one figure.
par(mfrow = c(2, 2))
# Standard diagnostic plots for the fitted model.
plot(lmfit)

plot of chunk unnamed-chunk-11

# Back to a single-panel layout for any later plots.
par(mfrow = c(1, 1))