Home » Software » R: advanced statistical package » Using lm() and predict() to apply a standard curve to Analytical Data
Using lm() and predict() to apply a standard curve to Analytical Data
Load input data (see attached files at bottom of this page)
#first the sample data
#note that field sep might be different based on pre-formatting
cn <- read.table("deb_pinn_C_N-raw.final.txt", sep=" ", header=TRUE)
#then the standards:
cn_std <- read.table("deb_pinn_C_N-standards.final.txt", sep="\t", header=TRUE)
# comput simple linear models from standards
# "mg_nitrogen as modeled by area under curve"
lm.N <- lm(mg_N ~ area_N, data=cn_std)
lm.C <- lm(mg_C ~ area_C, data=cn_std)
# check std curve stats:
summary(lm.N)
Multiple R-Squared: 0.9999, Adjusted R-squared: 0.9999
summary(lm.C)
Multiple R-Squared: 1, Adjusted R-squared: 1
Apply the standard curve to the raw measurements
# note that the predict method is looking for column names that where originally
# used in the creation of the lm object
# i.e. area_N for lm.N and area_C for lm.C
# therefore it is possible to pass the original data matrix with both
# values to predict(), while specifiying the lm object
cn$mg_N <- predict(lm.N, cn)
cn$mg_C <- predict(lm.C, cn)
Merge sample mass to calculate percent C/N by mass
#read in the initial mass data, note that by default string data will be read in as a factor
# i.e. factors are like treatments, and this data type will not work in some functions
cn.mass <- read.table("all_samples.masses.txt", header=TRUE, sep="\t")
#take a look at how the mass data was read in by read.table()
str(cn.mass)
'data.frame': 75 obs. of 5 variables:
$ id : <b>Factor</b> w/ 26 levels</b> "004K","007K",..: 15 16 17 18 19 20 21 22 23 24 ...
$ pedon_id : <b>Factor</b> w/ 18 levels "004K","007K",..: 15 15 15 18 18 18 17 17 17 16 ...
$ horizon_num: int 2 5 7 2 4 6 2 4 5 2 ...
$ sample_id : <b>Factor</b> w/ 75 levels "A1","A10","A11",..: 23 24 14 15 16 25 29 30 31 32 ...
$ sample_mg : num 24.6 27.5 33.3 25.9 25.8 ...
# use the merge() function to join the two dataframes based on the cell_id column
#merge() does not work with columns of type "level"
# convert them to characters in upper case, and append them to the original dataframe:
# note that merge is case sensitive !!!
cn$cell_id <- toupper(as.character(cn$sample_id))
cn.mass$cell_id <- toupper(as.character(cn.mass$sample_id))
#only keep our pedon data, leave behind the checks
cn.complete <- merge(x=cn, y=cn.mass, by.x="cell_id", by.y="cell_id", sort=FALSE, all.y=TRUE)
##calculate percent N and C, appending to the cn.complete dataframe
cn.complete$pct_N <- (cn.complete$mg_N / cn.complete$sample_mg) * 100
cn.complete$pct_C <- (cn.complete$mg_C / cn.complete$sample_mg) * 100
#look at the results:
str(cn.complete)
'data.frame': 75 obs. of 13 variables:
$ cell_id : chr "B8" "B9" "B10" "B11" ...
$ sample_id.x: Factor w/ 81 levels "A1","A10","A11",..: 24 25 15 16 17 26 30 31 32 33 ...
$ area_N : num 2225431 208028 341264 1377688 168328 ...
$ area_C : num 85307240 8296664 14624760 50879560 6690868 ...
$ mg_N : num 0.09261 0.01096 0.01635 0.05830 0.00935 ...
$ mg_C : num 1.2609 0.1204 0.2141 0.7510 0.0967 ...
$ id : Factor w/ 26 levels "004K","007K",..: 15 16 17 18 19 20 21 22 23 24 ...
$ pedon_id : Factor w/ 18 levels "004K","007K",..: 15 15 15 18 18 18 17 17 17 16 ...
$ horizon_num: int 2 5 7 2 4 6 2 4 5 2 ...
$ sample_id.y: Factor w/ 75 levels "A1","A10","A11",..: 23 24 14 15 16 25 29 30 31 32 ...
$ sample_mg : num 24.6 27.5 33.3 25.9 25.8 ...
$ pct_N : num 0.3759 0.0398 0.0491 0.2254 0.0363 ...
$ pct_C : num 5.117 0.438 0.643 2.903 0.375 ...
#save the data for further processing:
write.table(cn.complete, file="cn.complete.table", col.names=TRUE, row.names=FALSE)
Measure the accuracy of the sensor in the machine with simple correlation
### get a measure of how accurate the sensor was, based on our checks:
#just the first 5 columns, in case there is extra
cn.checks <- cn[c(13,26,39,52,65,78),][1:5]
#make a list of the mg of ACE in each check
checks.mg_ACE <- c(0.798, 1.588, 1.288, 1.574, 1.338, 1.191)
#make a column of the REAL mg_N based on the percent N in ACE
cn.checks$real_mg_N <- checks.mg_ACE * 0.104
#make a column of the REAL mg_C based on the percent C in ACE
cn.checks$real_mg_C <- checks.mg_ACE * 0.711
# check with cor()
Create a mutli-figure diagnostic plot
layout(mat=matrix(c(1, 4, 2, 3), nc = 2, nr = 2), width=c(1,1), height=c(1,2))
#first the std curves
par(mar = c(4,4,2,2))
#Nitrogen
plot(mg_N ~ area_N, data=cn_std, xlab="Area Counts", ylab="mg", main="Std Curve for N", cex=0.7, pch=16, cex.axis=0.6)
rug(cn$area_N, ticksize=0.02, col="gray")
rug(cn$mg_N, ticksize=0.02, col="gray", side=2)
abline(lm.N, col="gray", lty=2)
points(cn$area_N, cn$mg_N, col="blue", cex=0.2, pch=16)
#Carbon
plot(mg_C ~ area_C, data=cn_std, xlab="Area Counts", ylab="mg", main="Std Curve for C", cex=0.7, pch=16, cex.axis=0.6)
rug(cn$area_C, ticksize=0.02, col="gray")
rug(cn$mg_C, ticksize=0.02, col="gray", side=2)
abline(lm.C, col="gray", lty=2)
points(cn$area_C, cn$mg_C, col="blue", cex=0.2, pch=16)
#possible problems
points(cn$area_C[which(cn$area_C > 1.0e+08)], cn$mg_C[which(cn$area_C > 1.0e+08)] , col="red")
#sample plot of carbon distributions within each pedon:
#note that las=2 makes axis labels perpendicular to axis
par(mar = c(7,4,4,2))
#boxplot illustrating the within-pedon variation of Carbon
boxplot(cn.complete$pct_C ~ cn.complete$pedon_id , cex.axis=0.6, boxwex=0.2, las=2, main="Percent Total Carbon", ylab="% C", xlab="Pedon ID", cex=0.4)
#boxplot illustrating the within-pedon variation of Nitrogen
boxplot(cn.complete$pct_N ~ cn.complete$pedon_id , cex.axis=0.6, boxwex=0.2, las=2, main="Percent Total Nitrogen", ylab="% N", xlab="Pedon ID", cex=0.4)
Attachments:
deb_pinn_C_N-raw.final_.txt
deb_pinn_C_N-standards.final_.txt
all_samples.masses.txt
Software
- General Purpose Programming with Scripting Languages
- LaTeX Tips and Tricks
- PostGIS: Spatially enabled Relational Database Sytem
- PROJ: forward and reverse geographic projections
- GDAL and OGR: geodata conversion and re-projection tools
- R: advanced statistical package
- Access Data Stored in a Postgresql Database
- Additive Time Series Decomposition in R: Soil Moisture and Temperature Data
- Aggregating SSURGO Data in R
- Cluster Analysis 1: finding groups in a randomly generated 2-dimensional dataset
- Color Functions
- Comparison of Slope and Intercept Terms for Multi-Level Model
- Comparison of Slope and Intercept Terms for Multi-Level Model II: Using Contrasts
- Creating a Custom Panel Function (R - Lattice Graphics)
- Customized Scatterplot Ideas
- Estimating Missing Data with aregImpute() {R}
- Exploration of Multivariate Data
- Interactive 3D plots with the rgl package
- Making Soil Property vs. Depth Plots
- Numerical Integration/Differentiation in R: FTIR Spectra
- Plotting XRD (X-Ray Diffraction) Data
- Using lm() and predict() to apply a standard curve to Analytical Data
- Working with Spatial Data
- Comparison of PSA Results: Pipette vs. Laser Granulometer
- GRASS GIS: raster, vector, and imagery analysis
- Generic Mapping Tools: high quality map production