1
0
Fork 0
mirror of https://github.com/MillironX/beefblup.git synced 2024-09-20 21:12:03 +00:00
beefblup/src/BeefBLUP.jl

319 lines
9.4 KiB
Julia
Raw Normal View History

2021-06-18 17:07:58 +00:00
# beefblup
# Julia package for performing single-variate BLUP to find beef cattle
2021-06-18 17:07:58 +00:00
# breeding values
2021-06-18 18:26:35 +00:00
# (C) 2021 Thomas A. Christensen II
2021-06-18 17:07:58 +00:00
# Licensed under BSD-3-Clause License
# cSpell:includeRegExp #.*
# cSpell:includeRegExp ("""|''')[^\1]*\1
module BeefBLUP
2021-06-18 17:07:58 +00:00
# Import the required packages
using CSV
using DataFrames
using LinearAlgebra
using Dates
using Gtk
# Main entry-level function - acts just like the script
function beefblup()
2021-06-19 23:27:52 +00:00
# Ask for an input spreadsheet
path = open_dialog_native(
2021-06-18 17:07:58 +00:00
"Select a beefblup worksheet",
GtkNullContainer(),
("*.csv", GtkFileFilter("*.csv", name="beefblup worksheet"))
2021-06-19 23:27:52 +00:00
)
2021-06-18 17:07:58 +00:00
2021-06-19 23:27:52 +00:00
# Ask for an output text filename
savepath = save_dialog_native(
2021-06-18 17:07:58 +00:00
"Save your beefblup results",
GtkNullContainer(),
(GtkFileFilter("*.txt", name="Results file"),
"*.txt")
2021-06-19 23:27:52 +00:00
)
2021-06-18 17:07:58 +00:00
2021-06-19 23:27:52 +00:00
# Ask for heritability
print("What is the heritability for this trait?> ")
h2 = parse(Float64, readline(stdin))
2021-06-18 17:07:58 +00:00
2021-06-19 23:27:52 +00:00
beefblup(path, savepath, h2)
2021-06-18 19:13:28 +00:00
end
2021-06-18 20:25:34 +00:00
function beefblup(datafile::String, h2::Float64)
# Assume the data is named the same as the file without the trailing extension
2021-06-19 23:27:52 +00:00
dataname = join(split(datafile, ".")[1:end - 1])
2021-06-18 20:25:34 +00:00
# Create a new results name
resultsfile = string(dataname, "_results.txt")
# Pass this info on to the worker
beefblup(datafile, resultsfile, h2)
end
2021-06-18 19:13:28 +00:00
# Main worker function, can perform all the work if given all the user input
function beefblup(path::String, savepath::String, h2::Float64)
2021-06-19 23:27:52 +00:00
# Import data from a suitable spreadsheet
data = DataFrame(CSV.File(path))
2021-06-18 17:07:58 +00:00
2021-08-31 23:27:16 +00:00
# Make sure the data is in the proper format
renamecolstospec!(data)
2021-06-19 23:27:52 +00:00
# Sort the array by date
sort!(data, :birthdate)
2021-06-18 17:07:58 +00:00
2021-06-19 23:27:52 +00:00
# Define fields to hold id values for animals and their parents
numanimals = length(data.id)
2021-06-18 17:07:58 +00:00
# Calculate the relationship matrix
A = additiverelationshipmatrix(data.id, data.dam, data.sire)
2021-06-18 17:07:58 +00:00
2021-06-19 23:27:52 +00:00
# Extract all of the fixed effects
fixedfx = select(data, Not([:id, :birthdate, :sire, :dam]))[:,1:end - 1]
2021-06-18 17:07:58 +00:00
(X, numgroups, normal, adjustedtraits) = fixedeffectmatrix(fixedfx)
2021-06-18 17:07:58 +00:00
2021-06-19 23:27:52 +00:00
# Extract the observed data
Y = convert(Array{Float64}, data[:,end])
# The random effects matrix
Z = Matrix{Int}(I, numanimals, numanimals)
# Remove items where there is no data
nullobs = findall(isnothing, Y)
Z[nullobs, nullobs] .= 0
# Calculate heritability
λ = (1 - h2) / h2
# Use the mixed-model equations
MME = [X' * X X' * Z; Z' * X (Z' * Z) + (inv(A) .* λ)]
MMY = [X' * Y; Z' * Y]
solutions = MME \ MMY
# Find the accuracies
diaginv = diag(inv(MME))
reliability = ones(Float64, length(diaginv)) - diaginv .* λ
# Find how many traits we found BLUE for
numgroups = numgroups .- 1
# Extract the names of the traits
fixedfxnames = names(fixedfx)
traitname = names(data)[end]
# Start printing results to output
fileID = open(savepath, "w")
write(fileID, "beefblup Results Report\n")
write(fileID, "Produced using beefblup (")
write(fileID, "https://github.com/millironx/beefblup")
write(fileID, ")\n\n")
write(fileID, "Input:\t")
write(fileID, path)
write(fileID, "\nAnalysis performed:\t")
write(fileID, string(Dates.today()))
write(fileID, "\nTrait examined:\t")
write(fileID, traitname)
write(fileID, "\n\n")
# Print base population stats
write(fileID, "Base Population:\n")
for i in 1:length(normal)
write(fileID, "\t")
write(fileID, fixedfxnames[i])
write(fileID, ":\t")
write(fileID, normal[i])
write(fileID, "\n")
end
write(fileID, "\tMean ")
write(fileID, traitname)
2021-06-18 17:07:58 +00:00
write(fileID, ":\t")
2021-06-19 23:27:52 +00:00
write(fileID, string(solutions[1]))
write(fileID, "\n\n")
# Contemporary group adjustments
counter = 2
write(fileID, "Contemporary Group Effects:\n")
for i in 1:length(numgroups)
write(fileID, "\t")
write(fileID, fixedfxnames[i])
write(fileID, "\tEffect\tReliability\n")
for j in 1:numgroups[i]
write(fileID, "\t")
write(fileID, adjustedtraits[counter - 1])
write(fileID, "\t")
write(fileID, string(solutions[counter]))
write(fileID, "\t")
write(fileID, string(reliability[counter]))
write(fileID, "\n")
counter = counter + 1
end
write(fileID, "\n")
end
2021-06-18 17:07:58 +00:00
write(fileID, "\n")
2021-06-19 23:27:52 +00:00
# Expected breeding values
write(fileID, "Expected Breeding Values:\n")
write(fileID, "\tID\tEBV\tReliability\n")
for i in 1:numanimals
2021-06-18 17:07:58 +00:00
write(fileID, "\t")
2021-06-19 23:27:52 +00:00
write(fileID, string(data.id[i]))
2021-06-18 17:07:58 +00:00
write(fileID, "\t")
2021-06-19 23:27:52 +00:00
write(fileID, string(solutions[i + counter - 1]))
2021-06-18 17:07:58 +00:00
write(fileID, "\t")
2021-06-19 23:27:52 +00:00
write(fileID, string(reliability[i + counter - 1]))
2021-06-18 17:07:58 +00:00
write(fileID, "\n")
end
2021-06-19 23:27:52 +00:00
write(fileID, "\n - END REPORT -")
close(fileID)
end
function fixedeffectmatrix(fixedeffects::AbstractDataFrame)
# Find any columns that need to be deleted
for i in 1:ncol(fixedeffects)
if length(unique(fixedeffects[:,i])) <= 1
@warn string("column '", names(fixedeffects)[i], "' does not have any unique animals and will be removed from this analysis")
DataFrames.select!(fixedeffects, Not(i))
end
end
# Determine how many contemporary groups there are
numtraits = ncol(fixedeffects)
numgroups = ones(1, numtraits)
for i in 1:numtraits
numgroups[i] = length(unique(fixedeffects[:,i]))
end
# If there are more groups than animals, then the analysis cannot continue
numanimals = length(fixedeffects[:,1])
if sum(numgroups) >= numanimals
throw(ErrorException("there are more contemporary groups than animals"))
end
# Define a "normal" animal as one of the last in the groups, provided that
# all traits do not have null values
numtraits = ncol(fixedeffects)
numanimals = length(fixedeffects[:,1])
normal = Array{String}(undef, 1, numtraits)
for i in 1:numtraits
for j in numanimals:-1:1
if !ismissing(fixedeffects[j,i])
normal[i] = string(fixedeffects[j,i])
break
end
end
end
# Form the fixed-effect matrix
X = zeros(Int8, numanimals, floor(Int, sum(numgroups)) - length(numgroups) + 1)
X[:,1] = ones(Int8, 1, numanimals)
# Create an external counter that will increment through both loops
counter = 2
# Store the traits in a string array
adjustedtraits =
Array{String}(undef,floor(Int, sum(numgroups)) - length(numgroups))
# Iterate through each group
for i in 1:length(normal)
# Find the traits that are present in this trait
localdata = string.(fixedeffects[:,i])
traits = unique(localdata)
# Remove the normal version from the analysis
effecttraits = traits[findall(x -> x != normal[i], traits)]
# Iterate inside of the group
for j in 1:(length(effecttraits))
matchedindex = findall(x -> x == effecttraits[j], localdata)
X[matchedindex, counter] .= 1
# Add this trait to the string
adjustedtraits[counter - 1] = traits[j]
# Increment the big counter
counter = counter + 1
end
end
return X, numgroups, normal, adjustedtraits
end
"""
additiverelationshipmatrix(id, dam, sire)
Returns the additive numerator relationship matrix based on the pedigree provided in `dam`
and `sire` for animals in `id`.
"""
function additiverelationshipmatrix(id::AbstractVector, damid::AbstractVector, sireid::AbstractVector)
# Sanity-check for valid pedigree
if !(length(id) == length(damid) && length(damid) == length(sireid))
throw(ArgumentError("id, dam, and sire must be of the same length"))
end
# Convert to positions
dam = indexin(damid, id)
sire = indexin(sireid, id)
# Calculate loop iterations
numanimals = length(dam)
# Create an empty matrix for the additive relationship matrix
A = zeros(numanimals, numanimals)
# Create the additive relationship matrix by the FORTRAN method presented by
# Henderson
for i in 1:numanimals
if !isnothing(dam[i]) && !isnothing(sire[i])
for j in 1:(i - 1)
A[j,i] = 0.5 * (A[j,sire[i]] + A[j,dam[i]])
A[i,j] = A[j,i]
end
A[i,i] = 1 + 0.5 * A[sire[i], dam[i]]
elseif !isnothing(dam[i]) && isnothing(sire[i])
for j in 1:(i - 1)
A[j,i] = 0.5 * A[j,dam[i]]
A[i,j] = A[j,i]
end
A[i,i] = 1
elseif isnothing(dam[i]) && !isnothing(sire[i])
for j in 1:(i - 1)
A[j,i] = 0.5 * A[j,sire[i]]
A[i,j] = A[j,i]
end
A[i,i] = 1
else
for j in 1:(i - 1)
A[j,i] = 0
A[i,j] = 0
end
A[i,i] = 1
end
end
return A
end
2021-08-31 23:27:16 +00:00
"""
renamecolstospec(::DataFrame)
Renames the first four columns of the beefblup data sheet so that they can be referred to by
name instead of by column index, regardless of user input.
"""
function renamecolstospec!(df::DataFrame)
# Pull out the fixed-effect and observation name
othernames = propertynames(df)[5:end]
# Put specification column names and user-defined names together
allnames = cat([:id, :birthdate, :dam, :sire], othernames, dims=1)
# Rename in the DataFrame
rename!(df, allnames, makeunique=true)
return df
end
end