library(descr)
library(plyr)
Read the data in and save the raw data as an RDA file. Then create a smaller sample.
# Record the current working directory; all data paths below are relative.
wd <- getwd()

# Reading and caching the full review table is currently disabled:
# beerReviews <- read.csv('../../data/beer_reviews.csv')
# save(beerReviews,file='../../data/beer_reviews.rda')

# Read the breweries table from CSV and cache it as an RDA file.
breweries <- read.csv(file = "../../data/breweries.csv")
save(list = "breweries", file = "../../data/breweries.rda")
Current working directory is “/home/jakub/R/beer/code/rewcode”.
Subset review dataset
# Disabled sampling step: draw 200000 rows from the full review table with a
# fixed seed and store the sample both as RDA and as CSV.
# set.seed(1)
# subset <- beerReviews[sample(1:nrow(beerReviews), size = 200000), ]
# save(subset, file = '../../data/reviews_subset.rda')
# write.csv(subset, file = '../../data/reviews_subset.csv')

# Read the stored sample. The variable shares its name with base::subset();
# calls to subset() still resolve to the function because R skips
# non-function bindings when looking up a call.
subset <- read.csv(file = "../../data/reviews_subset.csv")
Reviews
# Size of the review sample (rows, columns).
dim(subset)
## [1] 200000 14
# First rows of the review sample.
head(subset)
## X brewery_id brewery_name review_time
## 1 421260 73 Great Lakes Brewing Company 1313022603
## 2 590417 1549 Goose Island Beer Co. 1274563790
## 3 908897 142 Spaten-Franziskaner-Bräu 1149727060
## 4 1440973 63 Deschutes Brewery 1237525967
## 5 319991 35 Boston Beer Company (Samuel Adams) 1274749248
## 6 1425394 385 Midnight Sun Brewing Co. 1301966988
## review_overall review_aroma review_appearance review_profilename
## 1 4.0 3.5 4.5 Fuzzy1
## 2 4.5 4.5 5.0 cpetrone84
## 3 4.0 3.0 4.0 jmz62565
## 4 4.0 4.5 3.0 sevopie
## 5 4.5 4.0 4.5 klewis
## 6 4.5 4.5 4.0 glid02
## beer_style review_palate review_taste
## 1 American Pale Ale (APA) 4.0 4.0
## 2 American Double / Imperial Stout 4.5 4.5
## 3 Doppelbock 4.0 3.5
## 4 American Porter 3.5 3.5
## 5 American Barleywine 4.5 4.0
## 6 English Barleywine 4.5 5.0
## beer_name beer_abv beer_beerid
## 1 Great Lakes Burning River Pale Ale 6.0 225
## 2 Bourbon County Brand Coffee Stout 13.0 57747
## 3 Spaten Optimator 7.6 99
## 4 Black Butte Porter 5.2 198
## 5 LongShot Mile High Barley Wine Ale 9.8 57272
## 6 Arctic Devil Barley Wine 13.2 18093
# Per-column summary of the review sample: quantiles for numeric columns,
# counts of the most common levels for factor columns.
summary(subset)
## X brewery_id
## Min. : 4 Min. : 1
## 1st Qu.: 396209 1st Qu.: 143
## Median : 794920 Median : 423
## Mean : 793427 Mean : 3115
## 3rd Qu.:1190326 3rd Qu.: 2372
## Max. :1586610 Max. :28003
##
## brewery_name review_time
## Boston Beer Company (Samuel Adams): 4883 Min. :8.85e+08
## Stone Brewing Co. : 4287 1st Qu.:1.17e+09
## Dogfish Head Brewery : 4241 Median :1.24e+09
## Sierra Nevada Brewing Co. : 3596 Mean :1.22e+09
## Bell's Brewery, Inc. : 3194 3rd Qu.:1.29e+09
## Rogue Ales : 2938 Max. :1.33e+09
## (Other) :176861
## review_overall review_aroma review_appearance review_profilename
## Min. :0.00 Min. :1.00 Min. :0.00 northyorksammy: 716
## 1st Qu.:3.50 1st Qu.:3.50 1st Qu.:3.50 mikesgroove : 586
## Median :4.00 Median :4.00 Median :4.00 BuckeyeNation : 546
## Mean :3.82 Mean :3.74 Mean :3.84 NeroFiddled : 441
## 3rd Qu.:4.50 3rd Qu.:4.00 3rd Qu.:4.00 womencantsail : 428
## Max. :5.00 Max. :5.00 Max. :5.00 Thorpe429 : 425
## (Other) :196858
## beer_style review_palate review_taste
## American IPA : 14726 Min. :1.00 Min. :1.00
## American Double / Imperial IPA : 10903 1st Qu.:3.50 1st Qu.:3.50
## American Pale Ale (APA) : 8198 Median :4.00 Median :4.00
## Russian Imperial Stout : 6786 Mean :3.75 Mean :3.79
## American Porter : 6384 3rd Qu.:4.00 3rd Qu.:4.50
## American Double / Imperial Stout: 6356 Max. :5.00 Max. :5.00
## (Other) :146647
## beer_name beer_abv
## 90 Minute IPA : 413 Min. : 0.05
## Stone Ruination IPA : 404 1st Qu.: 5.20
## India Pale Ale : 387 Median : 6.50
## Old Rasputin Russian Imperial Stout: 387 Mean : 7.05
## Sierra Nevada Celebration Ale : 366 3rd Qu.: 8.50
## Sierra Nevada Pale Ale : 349 Max. : 41.00
## (Other) :197694 NA's :8496.00
## beer_beerid
## Min. : 3
## 1st Qu.: 1717
## Median :13906
## Mean :21714
## 3rd Qu.:39441
## Max. :77317
##
# Class of every column in the review sample. Iterating over the columns
# directly avoids slicing out a row first, and vapply() always returns a
# character vector (a multi-class column is collapsed with "/").
vapply(subset, function(col) paste(class(col), collapse = "/"), character(1))
## X brewery_id brewery_name
## "integer" "integer" "factor"
## review_time review_overall review_aroma
## "integer" "numeric" "numeric"
## review_appearance review_profilename beer_style
## "numeric" "factor" "factor"
## review_palate review_taste beer_name
## "numeric" "numeric" "factor"
## beer_abv beer_beerid
## "numeric" "integer"
Breweries
# Size of the breweries table (rows, columns).
dim(breweries)
## [1] 4567 2
# First rows of the breweries table.
head(breweries)
## id country
## 1 18115 RO
## 2 2186 DE
## 3 16792 US
## 4 11618 US
## 5 10541 MX
## 6 15756 DE
# Per-column summary of the breweries table (id quantiles, country counts).
summary(breweries)
## id country
## Min. : 1 US :1750
## 1st Qu.: 2552 DE : 603
## Median : 8590 UK : 284
## Mean :10388 CA : 198
## 3rd Qu.:17092 AU : 183
## Max. :28003 (Other):1547
## NA's : 2
# Class of every column in the breweries table. Iterating over the columns
# directly avoids slicing out a row first, and vapply() always returns a
# character vector (a multi-class column is collapsed with "/").
vapply(
  breweries,
  function(col) paste(class(col), collapse = "/"),
  character(1)
)
## id country
## "integer" "factor"
# Attach the brewery country to each review by joining on the brewery id.
# all.x and all.y are both TRUE (a full outer join), so rows without a match
# on either side are kept and padded with NA.
# NOTE(review): the result has 959 more rows than the review sample; these
# look like breweries with no review in the sample. Confirm that such rows,
# whose review fields are all NA, are wanted in the analysis that follows.
data <- merge(
  x = subset, y = breweries,
  by.x = "brewery_id", by.y = "id",
  all.x = TRUE, all.y = TRUE
)
dim(data)
## [1] 200959 15
head(data, n = 6L)
## brewery_id X brewery_name review_time review_overall
## 1 1 46773 Plzensky Prazdroj, a. s. 1111822280 2.5
## 2 1 46100 Plzensky Prazdroj, a. s. 1246908992 3.0
## 3 1 46422 Plzensky Prazdroj, a. s. 1191757110 3.5
## 4 1 46728 Plzensky Prazdroj, a. s. 1125889212 3.0
## 5 1 46068 Plzensky Prazdroj, a. s. 1252816687 5.0
## 6 1 46263 Plzensky Prazdroj, a. s. 1218368958 4.5
## review_aroma review_appearance review_profilename beer_style
## 1 2.0 4.0 hefevice Czech Pilsener
## 2 3.5 3.5 civilizedpsycho Czech Pilsener
## 3 3.5 3.0 armock Czech Pilsener
## 4 2.0 3.0 BigBry Czech Pilsener
## 5 4.0 4.0 jsh420 Czech Pilsener
## 6 4.0 4.5 TurdFurgison Czech Pilsener
## review_palate review_taste beer_name beer_abv beer_beerid country
## 1 2.5 2.5 Pilsner Urquell 4.4 429 CZ
## 2 2.5 3.0 Pilsner Urquell 4.4 429 CZ
## 3 3.5 3.5 Pilsner Urquell 4.4 429 CZ
## 4 3.0 3.5 Pilsner Urquell 4.4 429 CZ
## 5 4.0 4.0 Pilsner Urquell 4.4 429 CZ
## 6 4.0 4.0 Pilsner Urquell 4.4 429 CZ
# Per-column summary of the merged table, including the NA counts per column.
summary(data)
## brewery_id X
## Min. : 1 Min. : 4
## 1st Qu.: 144 1st Qu.: 396209
## Median : 429 Median : 794920
## Mean : 3169 Mean : 793427
## 3rd Qu.: 2431 3rd Qu.:1190326
## Max. :28003 Max. :1586610
## NA's : 959
## brewery_name review_time
## Boston Beer Company (Samuel Adams): 4883 Min. :8.85e+08
## Stone Brewing Co. : 4287 1st Qu.:1.17e+09
## Dogfish Head Brewery : 4241 Median :1.24e+09
## Sierra Nevada Brewing Co. : 3596 Mean :1.22e+09
## Bell's Brewery, Inc. : 3194 3rd Qu.:1.29e+09
## (Other) :179799 Max. :1.33e+09
## NA's : 959 NA's :9.59e+02
## review_overall review_aroma review_appearance
## Min. : 0.00 Min. : 1.00 Min. : 0.00
## 1st Qu.: 3.50 1st Qu.: 3.50 1st Qu.: 3.50
## Median : 4.00 Median : 4.00 Median : 4.00
## Mean : 3.82 Mean : 3.74 Mean : 3.84
## 3rd Qu.: 4.50 3rd Qu.: 4.00 3rd Qu.: 4.00
## Max. : 5.00 Max. : 5.00 Max. : 5.00
## NA's :959.00 NA's :959.00 NA's :959.00
## review_profilename beer_style
## northyorksammy: 716 American IPA : 14726
## mikesgroove : 586 American Double / Imperial IPA: 10903
## BuckeyeNation : 546 American Pale Ale (APA) : 8198
## NeroFiddled : 441 Russian Imperial Stout : 6786
## womencantsail : 428 American Porter : 6384
## (Other) :197283 (Other) :153003
## NA's : 959 NA's : 959
## review_palate review_taste
## Min. : 1.00 Min. : 1.00
## 1st Qu.: 3.50 1st Qu.: 3.50
## Median : 4.00 Median : 4.00
## Mean : 3.75 Mean : 3.79
## 3rd Qu.: 4.00 3rd Qu.: 4.50
## Max. : 5.00 Max. : 5.00
## NA's :959.00 NA's :959.00
## beer_name beer_abv
## 90 Minute IPA : 413 Min. : 0.05
## Stone Ruination IPA : 404 1st Qu.: 5.20
## India Pale Ale : 387 Median : 6.50
## Old Rasputin Russian Imperial Stout: 387 Mean : 7.05
## Sierra Nevada Celebration Ale : 366 3rd Qu.: 8.50
## (Other) :198043 Max. : 41.00
## NA's : 959 NA's :9455.00
## beer_beerid country
## Min. : 3 US :135219
## 1st Qu.: 1717 BE : 15026
## Median :13906 DE : 12653
## Mean :21714 UK : 9054
## 3rd Qu.:39441 CA : 3970
## Max. :77317 (Other): 21279
## NA's : 959 NA's : 3758
# Class of every column in the merged table. Iterating over the columns
# directly avoids slicing out a row first, and vapply() always returns a
# character vector (a multi-class column is collapsed with "/").
vapply(data, function(col) paste(class(col), collapse = "/"), character(1))
## brewery_id X brewery_name
## "integer" "integer" "factor"
## review_time review_overall review_aroma
## "integer" "numeric" "numeric"
## review_appearance review_profilename beer_style
## "numeric" "factor" "factor"
## review_palate review_taste beer_name
## "numeric" "numeric" "factor"
## beer_abv beer_beerid country
## "numeric" "integer" "factor"
# Total number of missing cells across all columns of the merged table.
sum(is.na(data))
## [1] 24721
# Number of rows per country code; table() leaves out rows whose country is NA.
table(data$country)
##
## AL AM AR AT AU AW AZ BA BB BE
## 5 13 74 531 1352 8 1 9 9 15026
## BG BM BO BR BS BT BY BZ CA CH
## 28 4 10 243 25 1 5 10 3970 179
## CI CL CN CO CR CU CV CY CZ DE
## 1 8 153 549 26 7 1 10 768 12653
## DK DM DO DZ EC EE EG ES ET FI
## 1600 2 31 1 7 35 3 170 22 128
## FO FR GA GE GG GH GL GP GR GT
## 1 820 8 7 1 3 2 1 69 27
## GU HK HN HR HT HU ID IE IL IN
## 1 2 8 32 6 21 6 1369 1818 131
## IS IT JM JP KE KH KR KY KZ LA
## 25 642 165 880 40 2 34 7 12 10
## LB LC LK LT LU LV MA MC MD ME
## 21 7 127 129 11 37 1129 1 2 7
## MG MI MK MM MN MO MQ MT MU MX
## 1 3 2 3 7 3 1 16 1 852
## MY NG NI NL NO NP NY NZ OR PA
## 1 6 15 1290 568 1 1 322 5 7
## PE PF PG PH PK PL PR PS PT QC
## 32 8 1 71 1 461 8 1 42 2434
## RO RS RU SE SG SI SK SM SN SV
## 26 7 431 409 63 15 37 1 1 48
## TC TH TM TN TR TT TW TZ UA UG
## 1 99 1 1 43 41 9 2 88 1
## UK US UY UZ VE VI VN VU WA WS
## 9054 135219 5 3 7 23 36 1 33 1
## ZA ZW
## 30 2
# Frequency table of country codes with percentages; missing values get their
# own row. cex.names = 0.35 is passed through with the call, presumably to
# shrink the bar labels of the plot that freq() draws (TODO confirm).
freq(data$country, cex.names = 0.35)
## data$country
## Frequency Percent Valid Percent
## AL 5 2.488e-03 2.535e-03
## AM 13 6.469e-03 6.592e-03
## AR 74 3.682e-02 3.753e-02
## AT 531 2.642e-01 2.693e-01
## AU 1352 6.728e-01 6.856e-01
## AW 8 3.981e-03 4.057e-03
## AZ 1 4.976e-04 5.071e-04
## BA 9 4.479e-03 4.564e-03
## BB 9 4.479e-03 4.564e-03
## BE 15026 7.477e+00 7.620e+00
## BG 28 1.393e-02 1.420e-02
## BM 4 1.990e-03 2.028e-03
## BO 10 4.976e-03 5.071e-03
## BR 243 1.209e-01 1.232e-01
## BS 25 1.244e-02 1.268e-02
## BT 1 4.976e-04 5.071e-04
## BY 5 2.488e-03 2.535e-03
## BZ 10 4.976e-03 5.071e-03
## CA 3970 1.976e+00 2.013e+00
## CH 179 8.907e-02 9.077e-02
## CI 1 4.976e-04 5.071e-04
## CL 8 3.981e-03 4.057e-03
## CN 153 7.613e-02 7.759e-02
## CO 549 2.732e-01 2.784e-01
## CR 26 1.294e-02 1.318e-02
## CU 7 3.483e-03 3.550e-03
## CV 1 4.976e-04 5.071e-04
## CY 10 4.976e-03 5.071e-03
## CZ 768 3.822e-01 3.895e-01
## DE 12653 6.296e+00 6.416e+00
## DK 1600 7.962e-01 8.114e-01
## DM 2 9.952e-04 1.014e-03
## DO 31 1.543e-02 1.572e-02
## DZ 1 4.976e-04 5.071e-04
## EC 7 3.483e-03 3.550e-03
## EE 35 1.742e-02 1.775e-02
## EG 3 1.493e-03 1.521e-03
## ES 170 8.459e-02 8.621e-02
## ET 22 1.095e-02 1.116e-02
## FI 128 6.369e-02 6.491e-02
## FO 1 4.976e-04 5.071e-04
## FR 820 4.080e-01 4.158e-01
## GA 8 3.981e-03 4.057e-03
## GE 7 3.483e-03 3.550e-03
## GG 1 4.976e-04 5.071e-04
## GH 3 1.493e-03 1.521e-03
## GL 2 9.952e-04 1.014e-03
## GP 1 4.976e-04 5.071e-04
## GR 69 3.434e-02 3.499e-02
## GT 27 1.344e-02 1.369e-02
## GU 1 4.976e-04 5.071e-04
## HK 2 9.952e-04 1.014e-03
## HN 8 3.981e-03 4.057e-03
## HR 32 1.592e-02 1.623e-02
## HT 6 2.986e-03 3.043e-03
## HU 21 1.045e-02 1.065e-02
## ID 6 2.986e-03 3.043e-03
## IE 1369 6.812e-01 6.942e-01
## IL 1818 9.047e-01 9.219e-01
## IN 131 6.519e-02 6.643e-02
## IS 25 1.244e-02 1.268e-02
## IT 642 3.195e-01 3.256e-01
## JM 165 8.211e-02 8.367e-02
## JP 880 4.379e-01 4.462e-01
## KE 40 1.990e-02 2.028e-02
## KH 2 9.952e-04 1.014e-03
## KR 34 1.692e-02 1.724e-02
## KY 7 3.483e-03 3.550e-03
## KZ 12 5.971e-03 6.085e-03
## LA 10 4.976e-03 5.071e-03
## LB 21 1.045e-02 1.065e-02
## LC 7 3.483e-03 3.550e-03
## LK 127 6.320e-02 6.440e-02
## LT 129 6.419e-02 6.542e-02
## LU 11 5.474e-03 5.578e-03
## LV 37 1.841e-02 1.876e-02
## MA 1129 5.618e-01 5.725e-01
## MC 1 4.976e-04 5.071e-04
## MD 2 9.952e-04 1.014e-03
## ME 7 3.483e-03 3.550e-03
## MG 1 4.976e-04 5.071e-04
## MI 3 1.493e-03 1.521e-03
## MK 2 9.952e-04 1.014e-03
## MM 3 1.493e-03 1.521e-03
## MN 7 3.483e-03 3.550e-03
## MO 3 1.493e-03 1.521e-03
## MQ 1 4.976e-04 5.071e-04
## MT 16 7.962e-03 8.114e-03
## MU 1 4.976e-04 5.071e-04
## MX 852 4.240e-01 4.320e-01
## MY 1 4.976e-04 5.071e-04
## NG 6 2.986e-03 3.043e-03
## NI 15 7.464e-03 7.606e-03
## NL 1290 6.419e-01 6.542e-01
## NO 568 2.826e-01 2.880e-01
## NP 1 4.976e-04 5.071e-04
## NY 1 4.976e-04 5.071e-04
## NZ 322 1.602e-01 1.633e-01
## OR 5 2.488e-03 2.535e-03
## PA 7 3.483e-03 3.550e-03
## PE 32 1.592e-02 1.623e-02
## PF 8 3.981e-03 4.057e-03
## PG 1 4.976e-04 5.071e-04
## PH 71 3.533e-02 3.600e-02
## PK 1 4.976e-04 5.071e-04
## PL 461 2.294e-01 2.338e-01
## PR 8 3.981e-03 4.057e-03
## PS 1 4.976e-04 5.071e-04
## PT 42 2.090e-02 2.130e-02
## QC 2434 1.211e+00 1.234e+00
## RO 26 1.294e-02 1.318e-02
## RS 7 3.483e-03 3.550e-03
## RU 431 2.145e-01 2.186e-01
## SE 409 2.035e-01 2.074e-01
## SG 63 3.135e-02 3.195e-02
## SI 15 7.464e-03 7.606e-03
## SK 37 1.841e-02 1.876e-02
## SM 1 4.976e-04 5.071e-04
## SN 1 4.976e-04 5.071e-04
## SV 48 2.389e-02 2.434e-02
## TC 1 4.976e-04 5.071e-04
## TH 99 4.926e-02 5.020e-02
## TM 1 4.976e-04 5.071e-04
## TN 1 4.976e-04 5.071e-04
## TR 43 2.140e-02 2.181e-02
## TT 41 2.040e-02 2.079e-02
## TW 9 4.479e-03 4.564e-03
## TZ 2 9.952e-04 1.014e-03
## UA 88 4.379e-02 4.462e-02
## UG 1 4.976e-04 5.071e-04
## UK 9054 4.505e+00 4.591e+00
## US 135219 6.729e+01 6.857e+01
## UY 5 2.488e-03 2.535e-03
## UZ 3 1.493e-03 1.521e-03
## VE 7 3.483e-03 3.550e-03
## VI 23 1.145e-02 1.166e-02
## VN 36 1.791e-02 1.826e-02
## VU 1 4.976e-04 5.071e-04
## WA 33 1.642e-02 1.673e-02
## WS 1 4.976e-04 5.071e-04
## ZA 30 1.493e-02 1.521e-02
## ZW 2 9.952e-04 1.014e-03
## NA's 3758 1.870e+00
## Total 200959 1.000e+02 1.000e+02
# Count rows per country code, then list the counts from largest to smallest.
country.freq <- table(data[["country"]])
sort(country.freq, decreasing = TRUE)
##
## US BE DE UK CA QC IL DK IE AU
## 135219 15026 12653 9054 3970 2434 1818 1600 1369 1352
## NL MA JP MX FR CZ IT NO CO AT
## 1290 1129 880 852 820 768 642 568 549 531
## PL RU SE NZ BR CH ES JM CN IN
## 461 431 409 322 243 179 170 165 153 131
## LT FI LK TH UA AR PH GR SG SV
## 129 128 127 99 88 74 71 69 63 48
## TR PT TT KE LV SK VN EE KR WA
## 43 42 41 40 37 37 36 35 34 33
## HR PE DO ZA BG GT CR RO BS IS
## 32 32 31 30 28 27 26 26 25 25
## VI ET HU LB MT NI SI AM KZ LU
## 23 22 21 21 16 15 15 13 12 11
## BO BZ CY LA BA BB TW AW CL GA
## 10 10 10 10 9 9 9 8 8 8
## HN PF PR CU EC GE KY LC ME MN
## 8 8 8 7 7 7 7 7 7 7
## PA RS VE HT ID NG AL BY OR UY
## 7 7 7 6 6 6 5 5 5 5
## BM EG GH MI MM MO UZ DM GL HK
## 4 3 3 3 3 3 3 2 2 2
## KH MD MK TZ ZW AZ BT CI CV DZ
## 2 2 2 2 2 1 1 1 1 1
## FO GG GP GU MC MG MQ MU MY NP
## 1 1 1 1 1 1 1 1 1 1
## NY PG PK PS SM SN TC TM TN UG
## 1 1 1 1 1 1 1 1 1 1
## VU WS
## 1 1
# Number of distinct country codes.
length(country.freq)
## [1] 142
# A country is kept only if it has strictly more than this many rows.
country.freq.border <- 100
# Number of countries above the threshold.
frequent.mask <- country.freq > country.freq.border
sum(frequent.mask)
## [1] 33
frequent.countries <- names(country.freq)[frequent.mask]
# Number of rows in the merged table.
nrow(data)
## [1] 200959
# Keep only rows from the frequent countries, then rebuild the factor so that
# it carries just those countries as levels.
frequent.countries.data <- data[data$country %in% frequent.countries, ]
frequent.countries.data$country <- factor(
  frequent.countries.data$country,
  levels = frequent.countries
)
# Number of rows left after filtering.
nrow(frequent.countries.data)
## [1] 195572
# Frequency table (counts and percentages) of the retained countries.
# cex.names = 0.4 is passed through with the call, presumably to size the
# bar labels of the plot that freq() draws (TODO confirm).
freq(frequent.countries.data$country, cex.names = 0.4)
## frequent.countries.data$country
## Frequency Percent
## AT 531 0.27151
## AU 1352 0.69131
## BE 15026 7.68310
## BR 243 0.12425
## CA 3970 2.02994
## CH 179 0.09153
## CN 153 0.07823
## CO 549 0.28072
## CZ 768 0.39269
## DE 12653 6.46974
## DK 1600 0.81811
## ES 170 0.08692
## FI 128 0.06545
## FR 820 0.41928
## IE 1369 0.70000
## IL 1818 0.92958
## IN 131 0.06698
## IT 642 0.32827
## JM 165 0.08437
## JP 880 0.44996
## LK 127 0.06494
## LT 129 0.06596
## MA 1129 0.57728
## MX 852 0.43565
## NL 1290 0.65960
## NO 568 0.29043
## NZ 322 0.16465
## PL 461 0.23572
## QC 2434 1.24455
## RU 431 0.22038
## SE 409 0.20913
## UK 9054 4.62950
## US 135219 69.14027
## Total 195572 100.00000
# Distribution of the overall score per country, one box per country code.
boxplot(review_overall ~ country,
  data = frequent.countries.data,
  cex.axis = 0.3
)
# Mean overall score per country, ignoring missing scores.
means <- with(
  frequent.countries.data,
  tapply(review_overall, country, mean, na.rm = TRUE)
)
# Overlay the means on the boxes (pch 18, red), then list them best-first.
points(means, col = "red", pch = 18)
sort(means, decreasing = TRUE)
## IL QC BE DE FI LK US NO UK CZ MA IE
## 4.004 3.995 3.975 3.893 3.868 3.846 3.841 3.835 3.816 3.726 3.709 3.687
## DK CH SE AT CA NZ CO PL IT FR JP NL
## 3.684 3.678 3.652 3.631 3.569 3.563 3.556 3.509 3.500 3.497 3.469 3.465
## BR AU JM LT RU ES MX CN IN
## 3.412 3.359 3.339 3.176 3.159 3.139 3.068 2.993 2.723
# Average every column within each beer style; the style name ends up in the
# Group.1 column of the result.
# NOTE(review): mean() is applied to all columns, so the factor columns come
# out as NA and the id/timestamp columns get averages with no obvious
# meaning. Consider aggregating only the numeric score and ABV columns.
style.groups <- list(frequent.countries.data$beer_style)
aggdata <- aggregate(
  frequent.countries.data,
  by = style.groups,
  FUN = mean,
  na.rm = TRUE
)
# The six styles with the highest mean ABV.
head(aggdata[order(-aggdata$beer_abv), ])
## Group.1 brewery_id X brewery_name
## 42 Eisbock 1465 972332 NA
## 43 English Barleywine 1857 873188 NA
## 102 Wheatwine 2232 744630 NA
## 5 American Barleywine 2001 725890 NA
## 12 American Double / Imperial Stout 4819 595031 NA
## 29 Bière de Champagne / Bière Brut 1240 597545 NA
## review_time review_overall review_aroma review_appearance
## 42 1.202e+09 3.952 4.189 3.948
## 43 1.231e+09 3.838 3.966 3.906
## 102 1.250e+09 3.822 3.987 3.916
## 5 1.226e+09 3.904 4.023 4.039
## 12 1.260e+09 4.032 4.163 4.167
## 29 1.251e+09 3.577 3.679 3.971
## review_profilename beer_style review_palate review_taste beer_name
## 42 NA NA 4.083 4.208 NA
## 43 NA NA 3.914 3.979 NA
## 102 NA NA 3.954 3.987 NA
## 5 NA NA 4.000 4.057 NA
## 12 NA NA 4.101 4.188 NA
## 29 NA NA 3.697 3.602 NA
## beer_abv beer_beerid country
## 42 11.32 12778 NA
## 43 11.02 23575 NA
## 102 10.73 40275 NA
## 5 10.70 23027 NA
## 12 10.63 34765 NA
## 29 10.49 37928 NA
# Linear model of mean overall score on mean ABV, one observation per row of
# the per-style aggregate.
lmfit <- lm(formula = review_overall ~ beer_abv, data = aggdata)
# Scatter plot of the per-style means with the fitted line drawn on top.
plot(aggdata$review_overall ~ aggdata$beer_abv, col = "blue")
abline(reg = lmfit)
summary(lmfit)
##
## Call:
## lm(formula = review_overall ~ beer_abv, data = aggdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0834 -0.0600 0.0753 0.1727 0.4080
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.3629 0.0986 34.09 < 2e-16 ***
## beer_abv 0.0579 0.0146 3.96 0.00014 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.289 on 102 degrees of freedom
## Multiple R-squared: 0.133, Adjusted R-squared: 0.125
## F-statistic: 15.7 on 1 and 102 DF, p-value: 0.000139
# Draw the lm diagnostic plots on a 2 x 2 grid. par() returns the settings it
# replaces, so restoring that value puts the layout back exactly as it was
# instead of assuming it had been 1 x 1.
old.par <- par(mfrow = c(2, 2))
plot(lmfit)
par(old.par)