true data
library(ggplot2) # for graphics
library(MASS)
z <- read.table("fdata.csv",header=TRUE,sep=",")
str(z)
## 'data.frame': 1685 obs. of 5 variables:
## $ farm.code : chr "GBN 8" "GBN 8" "GBN 8" "GBN 8" ...
## $ number : int 2 2 1 1 2 1 6 2 1 1 ...
## $ area : int 522 522 522 522 522 522 522 522 522 522 ...
## $ N_trees : int 1 1 1 1 1 1 1 1 1 1 ...
## $ tree.density: num 0.00192 0.00192 0.00192 0.00192 0.00192 ...
summary(z)
## farm.code number area N_trees
## Length:1685 Min. : 0.000 Min. : 496 Min. : 0.000
## Class :character 1st Qu.: 1.000 1st Qu.: 2035 1st Qu.: 0.000
## Mode :character Median : 1.000 Median : 2860 Median : 0.000
## Mean : 1.758 Mean : 3541 Mean : 1.258
## 3rd Qu.: 2.000 3rd Qu.: 4176 3rd Qu.: 1.000
## Max. :41.000 Max. :15010 Max. :14.000
## tree.density
## Min. :0.0000000
## 1st Qu.:0.0000000
## Median :0.0000000
## Mean :0.0004429
## 3rd Qu.:0.0004902
## Max. :0.0034965
# getting rid of zeros
z <- z[z$tree.density >0,]
str(z)
## 'data.frame': 770 obs. of 5 variables:
## $ farm.code : chr "GBN 8" "GBN 8" "GBN 8" "GBN 8" ...
## $ number : int 2 2 1 1 2 1 6 2 1 1 ...
## $ area : int 522 522 522 522 522 522 522 522 522 522 ...
## $ N_trees : int 1 1 1 1 1 1 1 1 1 1 ...
## $ tree.density: num 0.00192 0.00192 0.00192 0.00192 0.00192 ...
summary(z$tree.density)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000977 0.0003565 0.0005447 0.0009693 0.0009091 0.0034965
p2 <- ggplot(data=z, aes(x=tree.density, y= ..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(p2)
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# add in a kernel density plot of the data
p2 <- p2 + geom_density(linetype="dotted",size=0.75)
print(p2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# fit a normal distribution to your data
normPars <- fitdistr(z$tree.density,"normal")
print(normPars)
## mean sd
## 9.692889e-04 1.072675e-03
## (3.865654e-05) (2.733430e-05)
str(normPars)
## List of 5
## $ estimate: Named num [1:2] 0.000969 0.001073
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 3.87e-05 2.73e-05
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 1.49e-09 0.00 0.00 7.47e-10
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 770
## $ loglik : num 4172
## - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
## mean
## 0.0009692889
# plot normal probability density
meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]
xval <- seq(0,max(z$tree.density),len=length(z$tree.density))
stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$tree.density), args = list(mean = meanML, sd = sdML))
p2 + stat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# exponential probability density
expoPars <- fitdistr(z$tree.density,"exponential")
rateML <- expoPars$estimate["rate"]
stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$tree.density), args = list(rate=rateML))
p2 + stat + stat2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# uniform probability density
stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$tree.density), args = list(min=min(z$tree.density), max=max(z$tree.density)))
p2 + stat + stat2 + stat3
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# gamma probability density
gammaPars <- fitdistr(z$tree.density,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]
stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$tree.density), args = list(shape=shapeML, rate=rateML))
p2 + stat + stat2 + stat3 + stat4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# beta probability density
pSpecial <- ggplot(data=z, aes(x=tree.density/(max(tree.density + 0.1)), y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2) +
xlim(c(0,1)) +
geom_density(size=0.75,linetype="dotted")
betaPars <- fitdistr(x=z$tree.density/max(z$tree.density + 0.1),start=list(shape1=1,shape2=2),"beta")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]
statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$tree.density), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (`geom_bar()`).
### simulated data using maximum likely parameters using rgamma as the best fit #############
myGamma <- rgamma(n=454,shape=1.24,rate= 1107.3)
myGamma <- data.frame(1:454, myGamma)
names(myGamma) <- list("ID","myVar")
# histogram plot of data
p1 <- ggplot(data=myGamma, aes(x=myVar, y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# adding probability density curve
statp1 <- stat_function(aes(x = myVar, y = ..y..), fun = dgamma, colour="brown", n = length(myGamma$myVar), args = list(shape=1.24, rate=1107.3))
p1+statp1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# comparing the distribution of original data vs simulated data
par(mfrow=c(1,2))
p2 + stat4 # original data
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p1+statp1 # simulated data
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
How do the two histogram profiles compare?
The two histogram depicts gamma distribution of density of trees in fields. The first one a real data while the second one is the simulated data. Both histograms are right skewed with majority of the data concentrated between 0.00 and 0.002.
Do you think the model is doing a good job of simulating realistic data that match your original measurements? Why or why not?
Yes, the model is did a good job simulating realistic data that matches the original data. The data shows the distribution of tree density in fields. Often, farmers are not motivated to keep trees within fields because trees create a canopy within the field and crops do not grow well under the shade. Farmers remove as many trees within fields to maximize the number seeds sown and also make up space for crops to grow. As a result, you are likely to find one to three trees within a field and the gamma distribution curve captures the picture of tree density in a real world.