beefblup/src/BeefBLUP.jl

# beefblup
# Julia package for performing single-variate BLUP to find beef cattle
# breeding values
# (C) 2021 Thomas A. Christensen II
# Licensed under BSD-3-Clause License
# cSpell:includeRegExp #.*
# cSpell:includeRegExp ("""|''')[^\1]*\1

module BeefBLUP

# Import the required packages
using CSV
using DataFrames
using LinearAlgebra
using Dates
using Gtk

# Main entry-level function - acts just like the script
function beefblup()

    # Ask for an input spreadsheet
    path = open_dialog_native(
    "Select a beefblup worksheet",
    GtkNullContainer(),
    ("*.csv", GtkFileFilter("*.csv", name="beefblup worksheet"))
    )

    # Ask for an output text filename
    savepath = save_dialog_native(
    "Save your beefblup results",
    GtkNullContainer(),
    (GtkFileFilter("*.txt", name="Results file"),
    "*.txt")
    )

    # Ask for heritability
    print("What is the heritability for this trait?> ")
    h2 = parse(Float64, readline(stdin))

    beefblup(path, savepath, h2)

end

function beefblup(datafile::String, h2::Float64)
    # Assume the data is named the same as the file without the trailing extension
    dataname = join(split(datafile, ".")[1:end - 1])

    # Create a new results name
    resultsfile = string(dataname, "_results.txt")

    # Pass this info on to the worker
    beefblup(datafile, resultsfile, h2)
end

# Main worker function, can perform all the work if given all the user input
function beefblup(path::String, savepath::String, h2::Float64)

    # Import data from a suitable spreadsheet
    data = DataFrame(CSV.File(path))

    # Make sure the data is in the proper format
    renamecolstospec!(data)

    # Sort the array by date
    sort!(data, :birthdate)

    # Define fields to hold id values for animals and their parents
    numanimals = length(data.id)

    # Calculate the relationship matrix
    A = additiverelationshipmatrix(data.id, data.dam, data.sire)

    # Extract all of the fixed effects
    fixedfx = select(data, Not([:id, :birthdate, :sire, :dam]))[:,1:end - 1]

    (X, numgroups, normal, adjustedtraits) = fixedeffectmatrix(fixedfx)

    # Extract the observed data
    Y = convert(Array{Float64}, data[:,end])

    # The random effects matrix
    Z = Matrix{Int}(I, numanimals, numanimals)

    # Remove items where there is no data
    nullobs = findall(isnothing, Y)
    Z[nullobs, nullobs] .= 0

    # Calculate heritability
    λ = (1 - h2) / h2

    # Use the mixed-model equations
    MME = [X' * X X' * Z; Z' * X (Z' * Z) + (inv(A) .* λ)]
    MMY = [X' * Y; Z' * Y]
    solutions = MME \ MMY

    # Find the accuracies
    diaginv = diag(inv(MME))
    reliability = ones(Float64, length(diaginv)) - diaginv .* λ

    # Find how many traits we found BLUE for
    numgroups = numgroups .- 1

    # Extract the names of the traits
    fixedfxnames = names(fixedfx)
    traitname = names(data)[end]

    # Start printing results to output
    fileID = open(savepath, "w")
    write(fileID, "beefblup Results Report\n")
    write(fileID, "Produced using beefblup (")
    write(fileID, "https://github.com/millironx/beefblup")
    write(fileID, ")\n\n")
    write(fileID, "Input:\t")
    write(fileID, path)
    write(fileID, "\nAnalysis performed:\t")
    write(fileID, string(Dates.today()))
    write(fileID, "\nTrait examined:\t")
    write(fileID, traitname)
    write(fileID, "\n\n")

    # Print base population stats
    write(fileID, "Base Population:\n")
    for i in 1:length(normal)
        write(fileID, "\t")
        write(fileID, fixedfxnames[i])
        write(fileID, ":\t")
        write(fileID, normal[i])
        write(fileID, "\n")
    end
    write(fileID, "\tMean ")
    write(fileID, traitname)
    write(fileID, ":\t")
    write(fileID, string(solutions[1]))
    write(fileID, "\n\n")

    # Contemporary group adjustments
    counter = 2
    write(fileID, "Contemporary Group Effects:\n")
    for i in 1:length(numgroups)
        write(fileID, "\t")
        write(fileID, fixedfxnames[i])
        write(fileID, "\tEffect\tReliability\n")
        for j in 1:numgroups[i]
            write(fileID, "\t")
            write(fileID, adjustedtraits[counter - 1])
            write(fileID, "\t")
            write(fileID, string(solutions[counter]))
            write(fileID, "\t")
            write(fileID, string(reliability[counter]))
            write(fileID, "\n")

            counter = counter + 1
        end
        write(fileID, "\n")
    end
    write(fileID, "\n")

    # Expected breeding values
    write(fileID, "Expected Breeding Values:\n")
    write(fileID, "\tID\tEBV\tReliability\n")
    for i in 1:numanimals
        write(fileID, "\t")
        write(fileID, string(data.id[i]))
        write(fileID, "\t")
        write(fileID, string(solutions[i + counter - 1]))
        write(fileID, "\t")
        write(fileID, string(reliability[i + counter - 1]))
        write(fileID, "\n")
    end

    write(fileID, "\n - END REPORT -")
    close(fileID)

end

function fixedeffectmatrix(fixedeffects::AbstractDataFrame)
    # Find any columns that need to be deleted
    for i in 1:ncol(fixedeffects)
        if length(unique(fixedeffects[:,i])) <= 1
            @warn string("column '", names(fixedeffects)[i], "' does not have any unique animals and will be removed from this analysis")
            DataFrames.select!(fixedeffects, Not(i))
        end
    end

    # Determine how many contemporary groups there are
    numtraits = ncol(fixedeffects)
    numgroups = ones(1, numtraits)
    for i in 1:numtraits
        numgroups[i] = length(unique(fixedeffects[:,i]))
    end

    # If there are more groups than animals, then the analysis cannot continue
    numanimals = length(fixedeffects[:,1])
    if sum(numgroups) >= numanimals
        throw(ErrorException("there are more contemporary groups than animals"))
    end

    # Define a "normal" animal as one of the last in the groups, provided that
    # all traits do not have null values
    numtraits = ncol(fixedeffects)
    numanimals = length(fixedeffects[:,1])
    normal = Array{String}(undef, 1, numtraits)
    for i in 1:numtraits
        for j in numanimals:-1:1
            if !ismissing(fixedeffects[j,i])
                normal[i] = string(fixedeffects[j,i])
                break
            end
        end
    end

    # Form the fixed-effect matrix
    X = zeros(Int8, numanimals, floor(Int, sum(numgroups)) - length(numgroups) + 1)
    X[:,1] = ones(Int8, 1, numanimals)

    # Create an external counter that will increment through both loops
    counter = 2

    # Store the traits in a string array
    adjustedtraits =
    Array{String}(undef,floor(Int, sum(numgroups)) - length(numgroups))
    # Iterate through each group
    for i in 1:length(normal)
        # Find the traits that are present in this trait
        localdata = string.(fixedeffects[:,i])
        traits = unique(localdata)
        # Remove the normal version from the analysis
        effecttraits = traits[findall(x -> x != normal[i], traits)]
        # Iterate inside of the group
        for j in 1:(length(effecttraits))
            matchedindex = findall(x -> x == effecttraits[j], localdata)
            X[matchedindex, counter] .= 1
            # Add this trait to the string
            adjustedtraits[counter - 1] = traits[j]
            # Increment the big counter
            counter = counter + 1
        end
    end

    return X, numgroups, normal, adjustedtraits
end

"""
    additiverelationshipmatrix(id, dam, sire)

Returns the additive numerator relationship matrix based on the pedigree provided in `dam`
and `sire` for animals in `id`.

"""
function additiverelationshipmatrix(id::AbstractVector, damid::AbstractVector, sireid::AbstractVector)
    # Sanity-check for valid pedigree
    if !(length(id) == length(damid) && length(damid) == length(sireid))
        throw(ArgumentError("id, dam, and sire must be of the same length"))
    end

    # Convert to positions
    dam = indexin(damid, id)
    sire = indexin(sireid, id)

    # Calculate loop iterations
    numanimals = length(dam)

    # Create an empty matrix for the additive relationship matrix
    A = zeros(numanimals, numanimals)

    # Create the additive relationship matrix by the FORTRAN method presented by
    # Henderson
    for i in 1:numanimals
        if !isnothing(dam[i]) && !isnothing(sire[i])
            for j in 1:(i - 1)
                A[j,i] = 0.5 * (A[j,sire[i]] + A[j,dam[i]])
                A[i,j] = A[j,i]
            end
            A[i,i] = 1 + 0.5 * A[sire[i], dam[i]]
        elseif !isnothing(dam[i]) && isnothing(sire[i])
        for j in 1:(i - 1)
            A[j,i] = 0.5 * A[j,dam[i]]
            A[i,j] = A[j,i]
        end
        A[i,i] = 1
    elseif isnothing(dam[i]) && !isnothing(sire[i])
        for j in 1:(i - 1)
            A[j,i] = 0.5 * A[j,sire[i]]
            A[i,j] = A[j,i]
        end
        A[i,i] = 1
    else
        for j in 1:(i - 1)
            A[j,i] = 0
            A[i,j] = 0
        end
        A[i,i] = 1
        end
    end

    return A
end

"""
    renamecolstospec(::DataFrame)

Renames the first four columns of the beefblup data sheet so that they can be referred to by
name instead of by column index, regardless of user input.
"""
function renamecolstospec!(df::DataFrame)
    # Pull out the fixed-effect and observation name
    othernames = propertynames(df)[5:end]

    # Put specification column names and user-defined names together
    allnames = cat([:id, :birthdate, :dam, :sire], othernames, dims=1)

    # Rename in the DataFrame
    rename!(df, allnames, makeunique=true)
    return df
end


end
Switch to UNIX line endings 2021-06-18 17:07:58 +00:00			`# beefblup`
Change main script to module and function call 2021-06-18 18:51:26 +00:00			`# Julia package for performing single-variate BLUP to find beef cattle`
Switch to UNIX line endings 2021-06-18 17:07:58 +00:00			`# breeding values`
Change copyright/release numbers 2021-06-18 18:26:35 +00:00			`# (C) 2021 Thomas A. Christensen II`
Switch to UNIX line endings 2021-06-18 17:07:58 +00:00			`# Licensed under BSD-3-Clause License`
			`# cSpell:includeRegExp #.*`
			`# cSpell:includeRegExp ("""\|''')[^\1]*\1`

Change main script to module and function call 2021-06-18 18:51:26 +00:00			`module BeefBLUP`

Switch to UNIX line endings 2021-06-18 17:07:58 +00:00			`# Import the required packages`
			`using CSV`
			`using DataFrames`
			`using LinearAlgebra`
			`using Dates`
			`using Gtk`

Change main script to module and function call 2021-06-18 18:51:26 +00:00			`# Main entry-level function - acts just like the script`
			`function beefblup()`

Reformat document 2021-06-19 23:27:52 +00:00			`# Ask for an input spreadsheet`
			`path = open_dialog_native(`
Switch to UNIX line endings 2021-06-18 17:07:58 +00:00			`"Select a beefblup worksheet",`
			`GtkNullContainer(),`
			`(".csv", GtkFileFilter(".csv", name="beefblup worksheet"))`
Reformat document 2021-06-19 23:27:52 +00:00			`)`
Switch to UNIX line endings 2021-06-18 17:07:58 +00:00
Reformat document 2021-06-19 23:27:52 +00:00			`# Ask for an output text filename`
			`savepath = save_dialog_native(`
Switch to UNIX line endings 2021-06-18 17:07:58 +00:00			`"Save your beefblup results",`
			`GtkNullContainer(),`
			`(GtkFileFilter("*.txt", name="Results file"),`
			`"*.txt")`
Reformat document 2021-06-19 23:27:52 +00:00			`)`
Switch to UNIX line endings 2021-06-18 17:07:58 +00:00
Reformat document 2021-06-19 23:27:52 +00:00			`# Ask for heritability`
			`print("What is the heritability for this trait?> ")`
			`h2 = parse(Float64, readline(stdin))`
Switch to UNIX line endings 2021-06-18 17:07:58 +00:00
Reformat document 2021-06-19 23:27:52 +00:00			`beefblup(path, savepath, h2)`
Split into input and worker functions 2021-06-18 19:13:28 +00:00
			`end`

Add results filename assumption 2021-06-18 20:25:34 +00:00			`function beefblup(datafile::String, h2::Float64)`
			`# Assume the data is named the same as the file without the trailing extension`
Reformat document 2021-06-19 23:27:52 +00:00			`dataname = join(split(datafile, ".")[1:end - 1])`
Add results filename assumption 2021-06-18 20:25:34 +00:00
			`# Create a new results name`
			`resultsfile = string(dataname, "_results.txt")`

			`# Pass this info on to the worker`
			`beefblup(datafile, resultsfile, h2)`
			`end`

Split into input and worker functions 2021-06-18 19:13:28 +00:00			`# Main worker function, can perform all the work if given all the user input`
			`function beefblup(path::String, savepath::String, h2::Float64)`

Reformat document 2021-06-19 23:27:52 +00:00			`# Import data from a suitable spreadsheet`
			`data = DataFrame(CSV.File(path))`
Switch to UNIX line endings 2021-06-18 17:07:58 +00:00
Make column names more robust 2021-08-31 23:27:16 +00:00			`# Make sure the data is in the proper format`
			`renamecolstospec!(data)`

Reformat document 2021-06-19 23:27:52 +00:00			`# Sort the array by date`
			`sort!(data, :birthdate)`
Switch to UNIX line endings 2021-06-18 17:07:58 +00:00
Reformat document 2021-06-19 23:27:52 +00:00			`# Define fields to hold id values for animals and their parents`
			`numanimals = length(data.id)`
Switch to UNIX line endings 2021-06-18 17:07:58 +00:00
Push pedigree matrix to its own function 2021-08-28 23:07:22 +00:00			`# Calculate the relationship matrix`
			`A = additiverelationshipmatrix(data.id, data.dam, data.sire)`
Switch to UNIX line endings 2021-06-18 17:07:58 +00:00
Reformat document 2021-06-19 23:27:52 +00:00			`# Extract all of the fixed effects`
			`fixedfx = select(data, Not([:id, :birthdate, :sire, :dam]))[:,1:end - 1]`
Switch to UNIX line endings 2021-06-18 17:07:58 +00:00
Refactor fixed effect solver into its own function 2021-06-20 00:56:32 +00:00			`(X, numgroups, normal, adjustedtraits) = fixedeffectmatrix(fixedfx)`
Switch to UNIX line endings 2021-06-18 17:07:58 +00:00
Reformat document 2021-06-19 23:27:52 +00:00			`# Extract the observed data`
			`Y = convert(Array{Float64}, data[:,end])`

			`# The random effects matrix`
			`Z = Matrix{Int}(I, numanimals, numanimals)`

			`# Remove items where there is no data`
			`nullobs = findall(isnothing, Y)`
			`Z[nullobs, nullobs] .= 0`

			`# Calculate heritability`
			`λ = (1 - h2) / h2`

			`# Use the mixed-model equations`
			`MME = [X' * X X' * Z; Z' * X (Z' * Z) + (inv(A) .* λ)]`
			`MMY = [X' * Y; Z' * Y]`
			`solutions = MME \ MMY`

			`# Find the accuracies`
			`diaginv = diag(inv(MME))`
			`reliability = ones(Float64, length(diaginv)) - diaginv .* λ`

			`# Find how many traits we found BLUE for`
			`numgroups = numgroups .- 1`

			`# Extract the names of the traits`
			`fixedfxnames = names(fixedfx)`
			`traitname = names(data)[end]`

			`# Start printing results to output`
			`fileID = open(savepath, "w")`
			`write(fileID, "beefblup Results Report\n")`
			`write(fileID, "Produced using beefblup (")`
			`write(fileID, "https://github.com/millironx/beefblup")`
			`write(fileID, ")\n\n")`
			`write(fileID, "Input:\t")`
			`write(fileID, path)`
			`write(fileID, "\nAnalysis performed:\t")`
			`write(fileID, string(Dates.today()))`
			`write(fileID, "\nTrait examined:\t")`
			`write(fileID, traitname)`
			`write(fileID, "\n\n")`

			`# Print base population stats`
			`write(fileID, "Base Population:\n")`
			`for i in 1:length(normal)`
			`write(fileID, "\t")`
			`write(fileID, fixedfxnames[i])`
			`write(fileID, ":\t")`
			`write(fileID, normal[i])`
			`write(fileID, "\n")`
			`end`
			`write(fileID, "\tMean ")`
			`write(fileID, traitname)`
Switch to UNIX line endings 2021-06-18 17:07:58 +00:00			`write(fileID, ":\t")`
Reformat document 2021-06-19 23:27:52 +00:00			`write(fileID, string(solutions[1]))`
			`write(fileID, "\n\n")`

			`# Contemporary group adjustments`
			`counter = 2`
			`write(fileID, "Contemporary Group Effects:\n")`
			`for i in 1:length(numgroups)`
			`write(fileID, "\t")`
			`write(fileID, fixedfxnames[i])`
			`write(fileID, "\tEffect\tReliability\n")`
			`for j in 1:numgroups[i]`
			`write(fileID, "\t")`
			`write(fileID, adjustedtraits[counter - 1])`
			`write(fileID, "\t")`
			`write(fileID, string(solutions[counter]))`
			`write(fileID, "\t")`
			`write(fileID, string(reliability[counter]))`
			`write(fileID, "\n")`

			`counter = counter + 1`
			`end`
			`write(fileID, "\n")`
			`end`
Switch to UNIX line endings 2021-06-18 17:07:58 +00:00			`write(fileID, "\n")`
Reformat document 2021-06-19 23:27:52 +00:00
			`# Expected breeding values`
			`write(fileID, "Expected Breeding Values:\n")`
			`write(fileID, "\tID\tEBV\tReliability\n")`
			`for i in 1:numanimals`
Switch to UNIX line endings 2021-06-18 17:07:58 +00:00			`write(fileID, "\t")`
Reformat document 2021-06-19 23:27:52 +00:00			`write(fileID, string(data.id[i]))`
Switch to UNIX line endings 2021-06-18 17:07:58 +00:00			`write(fileID, "\t")`
Reformat document 2021-06-19 23:27:52 +00:00			`write(fileID, string(solutions[i + counter - 1]))`
Switch to UNIX line endings 2021-06-18 17:07:58 +00:00			`write(fileID, "\t")`
Reformat document 2021-06-19 23:27:52 +00:00			`write(fileID, string(reliability[i + counter - 1]))`
Switch to UNIX line endings 2021-06-18 17:07:58 +00:00			`write(fileID, "\n")`
			`end`

Reformat document 2021-06-19 23:27:52 +00:00			`write(fileID, "\n - END REPORT -")`
			`close(fileID)`
Change main script to module and function call 2021-06-18 18:51:26 +00:00
			`end`
Refactor fixed effect solver into its own function 2021-06-20 00:56:32 +00:00
			`function fixedeffectmatrix(fixedeffects::AbstractDataFrame)`
			`# Find any columns that need to be deleted`
			`for i in 1:ncol(fixedeffects)`
			`if length(unique(fixedeffects[:,i])) <= 1`
			`@warn string("column '", names(fixedeffects)[i], "' does not have any unique animals and will be removed from this analysis")`
			`DataFrames.select!(fixedeffects, Not(i))`
			`end`
			`end`

			`# Determine how many contemporary groups there are`
			`numtraits = ncol(fixedeffects)`
			`numgroups = ones(1, numtraits)`
			`for i in 1:numtraits`
			`numgroups[i] = length(unique(fixedeffects[:,i]))`
			`end`

			`# If there are more groups than animals, then the analysis cannot continue`
			`numanimals = length(fixedeffects[:,1])`
			`if sum(numgroups) >= numanimals`
			`throw(ErrorException("there are more contemporary groups than animals"))`
			`end`

			`# Define a "normal" animal as one of the last in the groups, provided that`
			`# all traits do not have null values`
			`numtraits = ncol(fixedeffects)`
			`numanimals = length(fixedeffects[:,1])`
			`normal = Array{String}(undef, 1, numtraits)`
			`for i in 1:numtraits`
			`for j in numanimals:-1:1`
			`if !ismissing(fixedeffects[j,i])`
			`normal[i] = string(fixedeffects[j,i])`
			`break`
			`end`
			`end`
			`end`

			`# Form the fixed-effect matrix`
			`X = zeros(Int8, numanimals, floor(Int, sum(numgroups)) - length(numgroups) + 1)`
			`X[:,1] = ones(Int8, 1, numanimals)`

			`# Create an external counter that will increment through both loops`
			`counter = 2`

			`# Store the traits in a string array`
			`adjustedtraits =`
			`Array{String}(undef,floor(Int, sum(numgroups)) - length(numgroups))`
			`# Iterate through each group`
			`for i in 1:length(normal)`
			`# Find the traits that are present in this trait`
			`localdata = string.(fixedeffects[:,i])`
			`traits = unique(localdata)`
			`# Remove the normal version from the analysis`
			`effecttraits = traits[findall(x -> x != normal[i], traits)]`
			`# Iterate inside of the group`
			`for j in 1:(length(effecttraits))`
			`matchedindex = findall(x -> x == effecttraits[j], localdata)`
			`X[matchedindex, counter] .= 1`
			`# Add this trait to the string`
			`adjustedtraits[counter - 1] = traits[j]`
			`# Increment the big counter`
			`counter = counter + 1`
			`end`
			`end`

			`return X, numgroups, normal, adjustedtraits`
			`end`

Push pedigree matrix to its own function 2021-08-28 23:07:22 +00:00			`"""`
			`additiverelationshipmatrix(id, dam, sire)`

			Returns the additive numerator relationship matrix based on the pedigree provided in `dam`
			and `sire` for animals in `id`.

			`"""`
			`function additiverelationshipmatrix(id::AbstractVector, damid::AbstractVector, sireid::AbstractVector)`
			`# Sanity-check for valid pedigree`
			`if !(length(id) == length(damid) && length(damid) == length(sireid))`
			`throw(ArgumentError("id, dam, and sire must be of the same length"))`
			`end`

			`# Convert to positions`
			`dam = indexin(damid, id)`
			`sire = indexin(sireid, id)`

			`# Calculate loop iterations`
			`numanimals = length(dam)`

			`# Create an empty matrix for the additive relationship matrix`
			`A = zeros(numanimals, numanimals)`

			`# Create the additive relationship matrix by the FORTRAN method presented by`
			`# Henderson`
			`for i in 1:numanimals`
			`if !isnothing(dam[i]) && !isnothing(sire[i])`
			`for j in 1:(i - 1)`
			`A[j,i] = 0.5 * (A[j,sire[i]] + A[j,dam[i]])`
			`A[i,j] = A[j,i]`
			`end`
			`A[i,i] = 1 + 0.5 * A[sire[i], dam[i]]`
			`elseif !isnothing(dam[i]) && isnothing(sire[i])`
			`for j in 1:(i - 1)`
			`A[j,i] = 0.5 * A[j,dam[i]]`
			`A[i,j] = A[j,i]`
			`end`
			`A[i,i] = 1`
			`elseif isnothing(dam[i]) && !isnothing(sire[i])`
			`for j in 1:(i - 1)`
			`A[j,i] = 0.5 * A[j,sire[i]]`
			`A[i,j] = A[j,i]`
			`end`
			`A[i,i] = 1`
			`else`
			`for j in 1:(i - 1)`
			`A[j,i] = 0`
			`A[i,j] = 0`
			`end`
			`A[i,i] = 1`
			`end`
			`end`

			`return A`
			`end`

Make column names more robust 2021-08-31 23:27:16 +00:00			`"""`
			`renamecolstospec(::DataFrame)`

			`Renames the first four columns of the beefblup data sheet so that they can be referred to by`
			`name instead of by column index, regardless of user input.`
			`"""`
			`function renamecolstospec!(df::DataFrame)`
			`# Pull out the fixed-effect and observation name`
			`othernames = propertynames(df)[5:end]`

			`# Put specification column names and user-defined names together`
			`allnames = cat([:id, :birthdate, :dam, :sire], othernames, dims=1)`

			`# Rename in the DataFrame`
			`rename!(df, allnames, makeunique=true)`
			`return df`
			`end`

Push pedigree matrix to its own function 2021-08-28 23:07:22 +00:00
Change main script to module and function call 2021-06-18 18:51:26 +00:00			`end`