# beefblup # Julia package for performing single-variate BLUP to find beef cattle # breeding values # (C) 2021 Thomas A. Christensen II # Licensed under BSD-3-Clause License # cSpell:includeRegExp #.* # cSpell:includeRegExp ("""|''')[^\1]*\1 module BeefBLUP # Import the required packages using CSV using DataFrames using LinearAlgebra using Dates using Gtk # Main entry-level function - acts just like the script function beefblup() # Ask for an input spreadsheet path = open_dialog_native( "Select a beefblup worksheet", GtkNullContainer(), ("*.csv", GtkFileFilter("*.csv", name="beefblup worksheet")) ) # Ask for an output text filename savepath = save_dialog_native( "Save your beefblup results", GtkNullContainer(), (GtkFileFilter("*.txt", name="Results file"), "*.txt") ) # Ask for heritability print("What is the heritability for this trait?> ") h2 = parse(Float64, readline(stdin)) beefblup(path, savepath, h2) end function beefblup(datafile::String, h2::Float64) # Assume the data is named the same as the file without the trailing extension dataname = join(split(datafile, ".")[1:end - 1]) # Create a new results name resultsfile = string(dataname, "_results.txt") # Pass this info on to the worker beefblup(datafile, resultsfile, h2) end # Main worker function, can perform all the work if given all the user input function beefblup(path::String, savepath::String, h2::Float64) # Import data from a suitable spreadsheet data = DataFrame(CSV.File(path)) # Make sure the data is in the proper format renamecolstospec!(data) # Sort the array by date sort!(data, :birthdate) # Define fields to hold id values for animals and their parents numanimals = length(data.id) # Calculate the relationship matrix A = additiverelationshipmatrix(data.id, data.dam, data.sire) # Extract all of the fixed effects fixedfx = select(data, Not([:id, :birthdate, :sire, :dam]))[:,1:end - 1] (X, numgroups, normal, adjustedtraits) = fixedeffectmatrix(fixedfx) # Extract the observed data Y = convert(Array{Float64}, data[:,end]) # The random effects matrix Z = Matrix{Int}(I, numanimals, numanimals) # Remove items where there is no data nullobs = findall(isnothing, Y) Z[nullobs, nullobs] .= 0 # Calculate heritability λ = (1 - h2) / h2 # Use the mixed-model equations MME = [X' * X X' * Z; Z' * X (Z' * Z) + (inv(A) .* λ)] MMY = [X' * Y; Z' * Y] solutions = MME \ MMY # Find the accuracies diaginv = diag(inv(MME)) reliability = ones(Float64, length(diaginv)) - diaginv .* λ # Find how many traits we found BLUE for numgroups = numgroups .- 1 # Extract the names of the traits fixedfxnames = names(fixedfx) traitname = names(data)[end] # Start printing results to output fileID = open(savepath, "w") write(fileID, "beefblup Results Report\n") write(fileID, "Produced using beefblup (") write(fileID, "https://github.com/millironx/beefblup") write(fileID, ")\n\n") write(fileID, "Input:\t") write(fileID, path) write(fileID, "\nAnalysis performed:\t") write(fileID, string(Dates.today())) write(fileID, "\nTrait examined:\t") write(fileID, traitname) write(fileID, "\n\n") # Print base population stats write(fileID, "Base Population:\n") for i in 1:length(normal) write(fileID, "\t") write(fileID, fixedfxnames[i]) write(fileID, ":\t") write(fileID, normal[i]) write(fileID, "\n") end write(fileID, "\tMean ") write(fileID, traitname) write(fileID, ":\t") write(fileID, string(solutions[1])) write(fileID, "\n\n") # Contemporary group adjustments counter = 2 write(fileID, "Contemporary Group Effects:\n") for i in 1:length(numgroups) write(fileID, "\t") write(fileID, fixedfxnames[i]) write(fileID, "\tEffect\tReliability\n") for j in 1:numgroups[i] write(fileID, "\t") write(fileID, adjustedtraits[counter - 1]) write(fileID, "\t") write(fileID, string(solutions[counter])) write(fileID, "\t") write(fileID, string(reliability[counter])) write(fileID, "\n") counter = counter + 1 end write(fileID, "\n") end write(fileID, "\n") # Expected breeding values write(fileID, "Expected Breeding Values:\n") write(fileID, "\tID\tEBV\tReliability\n") for i in 1:numanimals write(fileID, "\t") write(fileID, string(data.id[i])) write(fileID, "\t") write(fileID, string(solutions[i + counter - 1])) write(fileID, "\t") write(fileID, string(reliability[i + counter - 1])) write(fileID, "\n") end write(fileID, "\n - END REPORT -") close(fileID) end function fixedeffectmatrix(fixedeffects::AbstractDataFrame) # Find any columns that need to be deleted for i in 1:ncol(fixedeffects) if length(unique(fixedeffects[:,i])) <= 1 @warn string("column '", names(fixedeffects)[i], "' does not have any unique animals and will be removed from this analysis") DataFrames.select!(fixedeffects, Not(i)) end end # Determine how many contemporary groups there are numtraits = ncol(fixedeffects) numgroups = ones(1, numtraits) for i in 1:numtraits numgroups[i] = length(unique(fixedeffects[:,i])) end # If there are more groups than animals, then the analysis cannot continue numanimals = length(fixedeffects[:,1]) if sum(numgroups) >= numanimals throw(ErrorException("there are more contemporary groups than animals")) end # Define a "normal" animal as one of the last in the groups, provided that # all traits do not have null values numtraits = ncol(fixedeffects) numanimals = length(fixedeffects[:,1]) normal = Array{String}(undef, 1, numtraits) for i in 1:numtraits for j in numanimals:-1:1 if !ismissing(fixedeffects[j,i]) normal[i] = string(fixedeffects[j,i]) break end end end # Form the fixed-effect matrix X = zeros(Int8, numanimals, floor(Int, sum(numgroups)) - length(numgroups) + 1) X[:,1] = ones(Int8, 1, numanimals) # Create an external counter that will increment through both loops counter = 2 # Store the traits in a string array adjustedtraits = Array{String}(undef,floor(Int, sum(numgroups)) - length(numgroups)) # Iterate through each group for i in 1:length(normal) # Find the traits that are present in this trait localdata = string.(fixedeffects[:,i]) traits = unique(localdata) # Remove the normal version from the analysis effecttraits = traits[findall(x -> x != normal[i], traits)] # Iterate inside of the group for j in 1:(length(effecttraits)) matchedindex = findall(x -> x == effecttraits[j], localdata) X[matchedindex, counter] .= 1 # Add this trait to the string adjustedtraits[counter - 1] = traits[j] # Increment the big counter counter = counter + 1 end end return X, numgroups, normal, adjustedtraits end """ additiverelationshipmatrix(id, dam, sire) Returns the additive numerator relationship matrix based on the pedigree provided in `dam` and `sire` for animals in `id`. """ function additiverelationshipmatrix(id::AbstractVector, damid::AbstractVector, sireid::AbstractVector) # Sanity-check for valid pedigree if !(length(id) == length(damid) && length(damid) == length(sireid)) throw(ArgumentError("id, dam, and sire must be of the same length")) end # Convert to positions dam = indexin(damid, id) sire = indexin(sireid, id) # Calculate loop iterations numanimals = length(dam) # Create an empty matrix for the additive relationship matrix A = zeros(numanimals, numanimals) # Create the additive relationship matrix by the FORTRAN method presented by # Henderson for i in 1:numanimals if !isnothing(dam[i]) && !isnothing(sire[i]) for j in 1:(i - 1) A[j,i] = 0.5 * (A[j,sire[i]] + A[j,dam[i]]) A[i,j] = A[j,i] end A[i,i] = 1 + 0.5 * A[sire[i], dam[i]] elseif !isnothing(dam[i]) && isnothing(sire[i]) for j in 1:(i - 1) A[j,i] = 0.5 * A[j,dam[i]] A[i,j] = A[j,i] end A[i,i] = 1 elseif isnothing(dam[i]) && !isnothing(sire[i]) for j in 1:(i - 1) A[j,i] = 0.5 * A[j,sire[i]] A[i,j] = A[j,i] end A[i,i] = 1 else for j in 1:(i - 1) A[j,i] = 0 A[i,j] = 0 end A[i,i] = 1 end end return A end """ renamecolstospec(::DataFrame) Renames the first four columns of the beefblup data sheet so that they can be referred to by name instead of by column index, regardless of user input. """ function renamecolstospec!(df::DataFrame) # Pull out the fixed-effect and observation name othernames = propertynames(df)[5:end] # Put specification column names and user-defined names together allnames = cat([:id, :birthdate, :dam, :sire], othernames, dims=1) # Rename in the DataFrame rename!(df, allnames, makeunique=true) return df end end