Probability Distribution

R Markdown

true data

library(ggplot2) # for graphics
library(MASS)
z <- read.table("fdata.csv",header=TRUE,sep=",")
str(z)

## 'data.frame':    1685 obs. of  5 variables:
##  $ farm.code   : chr  "GBN 8" "GBN 8" "GBN 8" "GBN 8" ...
##  $ number      : int  2 2 1 1 2 1 6 2 1 1 ...
##  $ area        : int  522 522 522 522 522 522 522 522 522 522 ...
##  $ N_trees     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ tree.density: num  0.00192 0.00192 0.00192 0.00192 0.00192 ...

summary(z)

##   farm.code             number            area          N_trees      
##  Length:1685        Min.   : 0.000   Min.   :  496   Min.   : 0.000  
##  Class :character   1st Qu.: 1.000   1st Qu.: 2035   1st Qu.: 0.000  
##  Mode  :character   Median : 1.000   Median : 2860   Median : 0.000  
##                     Mean   : 1.758   Mean   : 3541   Mean   : 1.258  
##                     3rd Qu.: 2.000   3rd Qu.: 4176   3rd Qu.: 1.000  
##                     Max.   :41.000   Max.   :15010   Max.   :14.000  
##   tree.density      
##  Min.   :0.0000000  
##  1st Qu.:0.0000000  
##  Median :0.0000000  
##  Mean   :0.0004429  
##  3rd Qu.:0.0004902  
##  Max.   :0.0034965

# getting rid of zeros
z <- z[z$tree.density >0,]
str(z)

## 'data.frame':    770 obs. of  5 variables:
##  $ farm.code   : chr  "GBN 8" "GBN 8" "GBN 8" "GBN 8" ...
##  $ number      : int  2 2 1 1 2 1 6 2 1 1 ...
##  $ area        : int  522 522 522 522 522 522 522 522 522 522 ...
##  $ N_trees     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ tree.density: num  0.00192 0.00192 0.00192 0.00192 0.00192 ...

summary(z$tree.density)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000977 0.0003565 0.0005447 0.0009693 0.0009091 0.0034965

p2 <- ggplot(data=z, aes(x=tree.density, y= ..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2)

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

print(p2)

## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# add in a kernel density plot of the data
p2 <-  p2 +  geom_density(linetype="dotted",size=0.75)
print(p2)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# fit a normal distribution to your data
normPars <- fitdistr(z$tree.density,"normal")
print(normPars)

##        mean            sd     
##   9.692889e-04   1.072675e-03 
##  (3.865654e-05) (2.733430e-05)

str(normPars)

## List of 5
##  $ estimate: Named num [1:2] 0.000969 0.001073
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 3.87e-05 2.73e-05
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 1.49e-09 0.00 0.00 7.47e-10
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 770
##  $ loglik  : num 4172
##  - attr(*, "class")= chr "fitdistr"

normPars$estimate["mean"] # note structure of getting a named attribute

##         mean 
## 0.0009692889

# plot normal probability density
meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]

xval <- seq(0,max(z$tree.density),len=length(z$tree.density))

stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$tree.density), args = list(mean = meanML, sd = sdML))
p2 + stat

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# exponential probability density
expoPars <- fitdistr(z$tree.density,"exponential")
rateML <- expoPars$estimate["rate"]

stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$tree.density), args = list(rate=rateML))
p2 + stat + stat2

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# uniform probability density
stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$tree.density), args = list(min=min(z$tree.density), max=max(z$tree.density)))
p2 + stat + stat2 + stat3

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# gamma probability density
gammaPars <- fitdistr(z$tree.density,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$tree.density), args = list(shape=shapeML, rate=rateML))
p2 + stat + stat2 + stat3 + stat4

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# beta probability density
pSpecial <- ggplot(data=z, aes(x=tree.density/(max(tree.density + 0.1)), y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) +
  xlim(c(0,1)) +
  geom_density(size=0.75,linetype="dotted")

betaPars <- fitdistr(x=z$tree.density/max(z$tree.density + 0.1),start=list(shape1=1,shape2=2),"beta")

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]

statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$tree.density), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 2 rows containing missing values (`geom_bar()`).

### simulated data using maximum likely parameters using rgamma as the best fit #############

myGamma <- rgamma(n=454,shape=1.24,rate= 1107.3)
myGamma <- data.frame(1:454, myGamma)
names(myGamma) <- list("ID","myVar")

# histogram plot of data
p1 <- ggplot(data=myGamma, aes(x=myVar, y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2)
print(p1)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# adding probability density curve
statp1 <- stat_function(aes(x = myVar, y = ..y..), fun = dgamma, colour="brown", n = length(myGamma$myVar), args = list(shape=1.24, rate=1107.3))
 p1+statp1

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# comparing the distribution of original data vs simulated data

par(mfrow=c(1,2))
p2 + stat4 # original data

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p1+statp1 # simulated data

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

How do the two histogram profiles compare?

The two histogram depicts gamma distribution of density of trees in fields. The first one a real data while the second one is the simulated data. Both histograms are right skewed with majority of the data concentrated between 0.00 and 0.002.

Do you think the model is doing a good job of simulating realistic data that match your original measurements? Why or why not?

Yes, the model is did a good job simulating realistic data that matches the original data. The data shows the distribution of tree density in fields. Often, farmers are not motivated to keep trees within fields because trees create a canopy within the field and crops do not grow well under the shade. Farmers remove as many trees within fields to maximize the number seeds sown and also make up space for crops to grow. As a result, you are likely to find one to three trees within a field and the gamma distribution curve captures the picture of tree density in a real world.

Probability Distribution

Carlos Amissah

2024-02-26

R Markdown