#!/usr/bin/env julia
using CSV
using DataFrames
using Dates
using JSON3
using Logging
using PromptingTools

# Send every log record to a timestamped log file.
# The timestamp is formatted explicitly: the default `DateTime` string contains ':'
# characters, which are not allowed in file names on Windows. Milliseconds are kept so
# that two runs started within the same second do not truncate each other's log.
log_stamp = Dates.format(now(), dateformat"yyyy-mm-dd_HH-MM-SS-sss")
log_io = open("antimicrobial-use-survey-analysis_$(log_stamp).log", "w+")
logger = SimpleLogger(log_io)
global_logger(logger)

# Load the tab-separated survey export into a DataFrame
# (`normalizenames=true` asks CSV.jl to turn the headers into valid identifiers).
survey_data = CSV.File("data.tsv"; delim='\t', normalizenames=true) |> DataFrame

# Drop the second data row before any analysis is done.
# NOTE(review): presumably this is the export's JSON metadata row -- confirm against
# data.tsv
deleteat!(survey_data, 2)

# Attach the wording of the question to every column as "description" metadata.
# The first row of each column holds the text of the question that was actually asked.
for name in names(survey_data)
    colmetadata!(survey_data, name, "description", first(survey_data[!, name]))
end #for

# Remove the messy JSON encoding
# TODO: For later graphs, move this step _before_ the import so that DataFrames can properly
# infer types
# deleteat!(survey_data, [1,2])

# Open-ended questions whose comments are compiled and analyzed.
# The analysis itself is offloaded to Llama3.1 running locally under Ollama.
questions = Symbol[
    :Q8,
    :Q16,
    :Q29,
    :Q30,
]

"""
    logged_prompt(prompt)

Send `prompt` to the `llama3.1` model through `PromptingTools.OllamaSchema` and return
the `content` of the generated message.

Both the prompt and the reply are written to the active logger at `Info` level.
"""
function logged_prompt(prompt)
    @info "Prompting Llama3.1 with \n```$prompt\n```\n"

    # `num_gpu=99` is forwarded to Ollama as a model option.
    # NOTE(review): presumably meant to offload all layers to the GPU -- confirm
    ollama_kwargs = (; options=(; num_gpu=99))
    message = aigenerate(
        PromptingTools.OllamaSchema(), prompt; model="llama3.1", api_kwargs=ollama_kwargs
    )
    response = message.content

    @info "Llama3.1 responsed with \n```\n$response\n```\n"
    return response
end #function

"""
    extract_themes(response)

Return the theme titles announced in `response`, in order of appearance.

A theme is any line that starts with a numbered list marker followed by a bolded title
and a colon, i.e. `N. **Title**: ...`. `N` may be any run of digits, so items 10, 20,
... are recognised as well as 1-9.
"""
function extract_themes(response)
    themes = String[]
    for line in eachline(IOBuffer(response))
        m = match(r"^\d+\. \*\*(.+)\*\*:", line)
        isnothing(m) || push!(themes, m[1])
    end #for
    return themes
end #function

"""
    parse_yes_no(response)

Interpret a yes/no reply: `true` if it begins with "yes", `false` if it begins with
"no", and `missing` otherwise.

Case is ignored, and so are leading characters that are not letters (whitespace,
Markdown emphasis, quotes), which would otherwise hide the verdict.
"""
function parse_yes_no(response)
    verdict = lowercase(lstrip(!isletter, response))
    startswith(verdict, "yes") && return true
    startswith(verdict, "no") && return false
    return missing
end #function

# question => answer text => theme => does the answer address the theme?
question_dict = Dict{Symbol,Dict{String,Dict{String,Union{Bool,Missing}}}}()

for q in questions
    # The first non-missing cell of the column is the question itself; everything after
    # it is an answer. Collected once because it is used for both prompts below.
    entries = collect(skipmissing(survey_data[!, q]))

    # One paragraph per entry, with the question set in bold
    paragraphs = join(
        (i == 1 ? "**$entry**\n" : "$entry\n" for (i, entry) in enumerate(entries)), '\n'
    )

    # Summarize the major themes among all answers
    analysis_prompt = """
    The following is a list of answers to a survey with one response per paragraph:

    # Antimicrobial usage survey open-ended question: $q

    $paragraphs

    ---

    Summarize the common themes between the survey responses.
    """

    analysis_response = logged_prompt(analysis_prompt)

    # Compile all themes that Llama3 identified.
    # Llama3 tends to summarize each theme with a bolded statement, so the bolded
    # statements of the numbered list are extracted as the themes.
    themes = extract_themes(analysis_response)
    @info "Found themes $themes"

    answer_dict = Dict{String,Dict{String,Union{Bool,Missing}}}()

    # Now go back through each answer and check if it is addressing the themes noted.
    # The first entry is skipped because it is the question, not an answer.
    for answer in Iterators.drop(entries, 1)
        theme_dict = Dict{String,Union{Bool,Missing}}()

        for theme in themes
            theme_prompt = """
            The following was answered as a free-response answer on a survey:

            $answer

            ---

            Does this answer deal with the theme of $theme? Answer yes or no.
            """

            theme_dict[theme] = parse_yes_no(logged_prompt(theme_prompt))
        end #for

        # Identical answer texts share one entry; the last classification wins
        answer_dict[string(answer)] = theme_dict
    end #for

    question_dict[q] = answer_dict
end #for


# Serialize the full question -> answer -> theme mapping to results.json
open(io -> JSON3.write(io, question_dict), "results.json", "w")

# Write a human-readable Markdown report: one section per question and, under every
# answer, the themes matched to it. Themes answered "no" are omitted; themes whose
# yes/no reply could not be interpreted are listed with a trailing '?'.
open("results.md", "w") do f
    write(f, "# Antimicrobial usage survey open-ended questions\n\n")

    # Walk `questions` rather than the Dict itself so that the sections always appear
    # in the order the questions were declared instead of in hash order.
    for question in questions
        answers = question_dict[question]

        # The first row of the column holds the wording of the question
        write(f, "## $(question): $(first(survey_data[!, question]))\n\n")

        for (answer, themes) in answers
            write(f, "$answer\n\n    Themes:\n")

            for (theme, is_answered) in themes
                if ismissing(is_answered)
                    write(f, "        - $theme?\n")
                elseif is_answered
                    write(f, "        - $theme\n")
                end #if

            end #for (theme)
            write(f, "\n\n---\n\n")
        end #for (answer)
        write(f, "\n\n")
    end #for (question)
    write(f, "\n\n")
end #do

# Convert the Markdown report to a Word document with the external `pandoc` program.
try
    run(`pandoc results.md -o results.docx`)
catch err
    # Record the failure in the log file before letting it propagate
    @error "pandoc conversion failed" exception = (err, catch_backtrace())
    rethrow()
finally
    # Close log file, also when the conversion above throws
    close(log_io)
end #try