antimicrobial-use-survey-an.../main.jl

#!/usr/bin/env julia
using CSV
using DataFrames
using Dates
using JSON3
using Logging
using PromptingTools

# Set logging to file
# The timestamp is formatted explicitly instead of interpolating `now()` directly: the
# default DateTime rendering (e.g. 2024-09-04T20:21:32.123) contains ':' characters, which
# are not allowed in file names on Windows.
log_timestamp = Dates.format(now(), dateformat"yyyy-mm-dd_HHMMSS")
log_path = "antimicrobial-use-survey-analysis_$(log_timestamp).log"
log_io = open(log_path, "w+")
# Route every logging macro (@info, @warn, ...) used by the script into the log file
logger = SimpleLogger(log_io)
global_logger(logger)

# Load the tab-separated survey export; `normalizenames` turns the header cells into
# valid Julia identifiers so that columns can be addressed as symbols (e.g. :Q8)
survey_data = CSV.File("data.tsv"; delim='\t', normalizenames=true) |> DataFrame
# Drop the second data row
# NOTE(review): presumably this is the JSON-encoded export row mentioned in the TODO
# further down — confirm against data.tsv
deleteat!(survey_data, 2)

# The first remaining row holds the text of the question that was actually asked;
# attach it to every column as its "description" metadata
for column_name in names(survey_data)
    colmetadata!(
        survey_data, column_name, "description", first(survey_data[!, column_name])
    )
end #for

# Remove the messy JSON encoding
# TODO: For later graphs, move this step _before_ the import so that DataFrames can properly
# infer types
# deleteat!(survey_data, [1,2])

# Columns holding the open-ended questions whose comments are compiled and analyzed.
# The analysis itself is offloaded to Llama3.1 running locally under Ollama.
questions = [Symbol("Q", number) for number in (8, 16, 29, 30)]

"""
    logged_prompt(prompt; model="llama3.1")

Send `prompt` to `model` through Ollama and return the text of the reply.

Both the prompt and the reply are written to the current logger at `Info` level, each
wrapped in a fenced block, so that every exchange with the model can be reviewed later.

# Arguments
- `prompt`: the prompt passed on to `aigenerate`.

# Keywords
- `model`: name of the Ollama model to query. Defaults to `"llama3.1"`.

# Returns
The `content` field of the message returned by `aigenerate`.
"""
function logged_prompt(prompt; model="llama3.1")
    # The fence is followed by a newline so the first prompt line is not fused onto it
    @info "Prompting $model with \n```\n$prompt\n```\n"
    response = aigenerate(
        PromptingTools.OllamaSchema(),
        prompt;
        model=model,
        # NOTE(review): num_gpu=99 presumably asks Ollama to offload all layers to the
        # GPU — confirm against the Ollama options documentation
        api_kwargs=(; options=(; num_gpu=99)),
    ).content
    @info "$model responded with \n```\n$response\n```\n"
    return response
end #function

"""
    extract_themes(response)

Return the theme titles found in the model's summary `response`.

A theme is recognized on any line shaped like `<number>. **<title>**:`, i.e. a numbered
list item whose bolded title is followed by a colon. Only the title is returned.
"""
function extract_themes(response)
    themes = String[]
    for line in eachline(IOBuffer(response))
        # [0-9]+ (not [1-9]+) so that items numbered 10, 20, ... are matched as well;
        # the lazy (.+?) stops at the first closing `**:` instead of the last one
        m = match(r"^[0-9]+\. \*\*(.+?)\*\*:", line)
        isnothing(m) || push!(themes, m[1])
    end #for
    return themes
end #function

"""
    classify_yes_no(response)

Interpret a model reply to a yes/no question.

Return `true` when the reply starts with the word "yes", `false` when it starts with the
word "no", and `missing` otherwise. Matching is case-insensitive and ignores leading
whitespace and markdown emphasis characters (`*`, `_`). Whole words are required, so
replies such as "None of the above" or "Not sure" are reported as `missing` rather than
being counted as a definite "no".
"""
function classify_yes_no(response)
    occursin(r"^[\s*_]*yes\b"i, response) && return true
    occursin(r"^[\s*_]*no\b"i, response) && return false
    return missing
end #function

# question => (answer text => (theme => does the answer address the theme?))
question_dict = Dict()

for q in questions
    # Non-missing cells of this question's column. The first one is the question text
    # itself, the remaining ones are the respondents' answers.
    responses = collect(skipmissing(survey_data[!, q]))

    # One paragraph per response, with the question itself in bold at the top
    answers_block = join(
        (i == 1 ? "**$a**\n" : "$a\n" for (i, a) in enumerate(responses)), '\n'
    )

    # Summarize the major themes among all answers
    analysis_prompt = """
    The following is a list of answers to a survey with one response per paragraph:

    # Antimicrobial usage survey open-ended question: $q

    $answers_block

    ---

    Summarize the common themes between the survey responses.
    """

    analysis_response = logged_prompt(analysis_prompt)

    # Compile all themes that Llama3 identified.
    # Llama3 tends to summarize each theme with a bolded statement.
    # We will extract the bolded statements to compile themes
    themes = extract_themes(analysis_response)
    @info "Found themes $themes"
    if isempty(themes)
        # Without themes every answer below ends up with an empty theme list, so make
        # the situation visible in the log instead of failing silently
        @warn "No themes could be extracted from the summary for question $q"
    end #if

    # NOTE: keyed by answer text, so respondents who gave identical answers share a
    # single entry (the last classification wins)
    answer_dict = Dict()

    # Now go back through each answer and check if it is addressing the theme noted
    # (the first "answer" is the question, so it is skipped)
    for a in Iterators.drop(responses, 1)
        theme_dict = Dict{String,Union{Bool,Missing}}()

        for t in themes
            theme_prompt = """
            The following was answered as a free-response answer on a survey:

            $a

            ---

            Does this answer deal with the theme of $t? Answer yes or no.
            """

            # `missing` marks replies that are neither a clear yes nor a clear no
            theme_dict[t] = classify_yes_no(logged_prompt(theme_prompt))
        end #for

        answer_dict[a] = theme_dict

    end #for

    question_dict[q] = answer_dict

end #for


# Dump the complete question => answer => theme mapping as machine-readable JSON
open(io -> JSON3.write(io, question_dict), "results.json", "w")

# Write a human-readable Markdown report: one section per question, and under each
# answer the themes it was judged to address. Themes the model could not clearly decide
# on (stored as `missing`) are listed with a trailing "?"; themes judged not to apply
# are omitted.
open("results.md", "w") do f
    write(f, "# Antimicrobial usage survey open-ended questions\n\n")

    # Walk `questions` instead of iterating `question_dict` directly: a Dict is iterated
    # in hash order, which would place the sections in an arbitrary order instead of the
    # order the questions were listed in
    for question in questions
        haskey(question_dict, question) || continue
        answers = question_dict[question]

        # The first row of each column holds the text of the question
        write(f, "## $(question): $(first(survey_data[!, question]))\n\n")

        for (answer, themes) in answers
            write(f, "$answer\n\n    Themes:\n")

            for (theme, is_answered) in themes
                if ismissing(is_answered)
                    write(f, "        - $theme?\n")
                elseif is_answered
                    write(f, "        - $theme\n")
                end #if

            end #for (theme)
            write(f, "\n\n---\n\n")
        end #for (answer)
        write(f, "\n\n")
    end #for (question)
    write(f, "\n\n")
end #do

# Convert the Markdown report into a Word document with pandoc.
# `run` throws when pandoc cannot be started or exits with a non-zero status, so the log
# file is closed in a `finally` clause to make sure it is flushed and released either way.
try
    run(`pandoc results.md -o results.docx`)
finally
    # Close log file
    close(log_io)
end #try
feat: Add response sorting script 2024-09-03 22:50:16 +00:00			`#!/usr/bin/env julia`
			`using CSV`
			`using DataFrames`
feat: Add logging to script 2024-09-04 20:21:32 +00:00			`using Dates`
chore: Add JSON3 2024-09-04 21:49:17 +00:00			`using JSON3`
feat: Add logging to script 2024-09-04 20:21:32 +00:00			`using Logging`
chore: Add PromptingTools 2024-09-04 20:46:28 +00:00			`using PromptingTools`
feat: Add logging to script 2024-09-04 20:21:32 +00:00
			`# Set logging to file`
			`log_io = open("antimicrobial-use-survey-analysis_$(now()).log", "w+")`
			`logger = SimpleLogger(log_io)`
			`global_logger(logger)`
feat: Add response sorting script 2024-09-03 22:50:16 +00:00
			`# Import data`
			`survey_data = DataFrame(CSV.File("data.tsv"; delim='\t', normalizenames=true))`
			`deleteat!(survey_data, 2)`

			`# Set descriptions of each column based on the actual question asked`
			`for (i, col) in enumerate(eachcol(survey_data))`
			`colmetadata!(survey_data, i, "description", first(col))`
			`end #for`

			`# Remove the messy JSON encoding`
			`# TODO: For later graphs, move this step _before_ the import so that DataFrames can properly`
			`# infer types`
			`# deleteat!(survey_data, [1,2])`

wip: Make Ollama offloading step (broken) 2024-09-04 20:45:48 +00:00			`# Compile comments from all questions and analyze`
			`# We will be offloading the analysis to Ollama running Llama3.1 locally`
feat: Add response sorting script 2024-09-03 22:50:16 +00:00			`questions = [:Q8, :Q16, :Q29, :Q30]`
wip: Make Ollama offloading step (broken) 2024-09-04 20:45:48 +00:00
refactor: Add prompt-specific logic to function 2024-09-04 21:02:16 +00:00			`function logged_prompt(prompt)`
			@info "Prompting Llama3.1 with \n```$prompt\n```\n"
			`response = aigenerate(`
			`PromptingTools.OllamaSchema(),`
			`prompt;`
			`model="llama3.1",`
			`api_kwargs=(; options=(; num_gpu=99))`
			`).content`
			@info "Llama3.1 responsed with \n```\n$response\n```\n"
			`return response`
			`end #function`

feat: Add question-by-question theme analysis 2024-09-04 22:12:54 +00:00			`question_dict = Dict()`

wip: Make Ollama offloading step (broken) 2024-09-04 20:45:48 +00:00			`for q in questions`
feat: Add question-by-question theme analysis 2024-09-04 22:12:54 +00:00			`# Summarize the major themes among all answers`
wip: Make Ollama offloading step (broken) 2024-09-04 20:45:48 +00:00			`analysis_prompt = """`
			`The following is a list of answers to a survey with one response per paragraph:`

			`# Antimicrobial usage survey open-ended question: $q`

			`$(`
			`join(`
			`[`
			`i == 1 ? "**$a**\n" : "$a\n" for (i, a) in enumerate(skipmissing(survey_data[!, q]))`
			`],`
			`'\n'`
			`)`
			`)`

			`---`

			`Summarize the common themes between the survey responses.`
			`"""`

refactor: Add prompt-specific logic to function 2024-09-04 21:02:16 +00:00			`analysis_response = logged_prompt(analysis_prompt)`
wip: Make Ollama offloading step (broken) 2024-09-04 20:45:48 +00:00
feat: Add question-by-question theme analysis 2024-09-04 22:12:54 +00:00
			`# Compile all themes that Llama3 identified.`
			`# Llama3 tends to summarize each theme with a bolded statement.`
			`# We will extract the bolded statements to compile themes`
			`themes = String[]`
			`for l in eachline(IOBuffer(analysis_response))`
			`m = match(r"^[1-9]+\. \*\*(.+)\*\*:", l)`
			`isnothing(m) || push!(themes, first(m))`
			`end #for`
			`@info "Found themes $themes"`

			`answer_dict = Dict()`

			`# Now go back through each answer and check if it is addressing the theme noted`
			`for (i, a) in enumerate(skipmissing(survey_data[!, q]))`
			`i == 1 && continue #first "answer" is the question`

			`theme_dict = Dict{String,Union{Bool,Missing}}()`

			`for t in themes`
			`theme_prompt = """`
			`The following was answered as a free-response answer on a survey:`

			`$a`

			`---`

			`Does this answer deal with the theme of $t? Answer yes or no.`
			`"""`

			`theme_response = logged_prompt(theme_prompt)`

			`if startswith(lowercase(theme_response), "yes")`
			`theme_dict[t] = true`
			`elseif startswith(lowercase(theme_response), "no")`
			`theme_dict[t] = false`
			`else`
			`theme_dict[t] = missing`
			`end #if`
			`end #for`

			`answer_dict[a] = theme_dict`

			`end #for`

			`question_dict[q] = answer_dict`

wip: Make Ollama offloading step (broken) 2024-09-04 20:45:48 +00:00			`end #for`


feat: Add question-by-question theme analysis 2024-09-04 22:12:54 +00:00			`open("results.json", "w") do io`
			`JSON3.write(io, question_dict)`
			`end`

refactor: Compile everything into a single report file 2024-09-10 19:28:17 +00:00			`open("results.md", "w") do f`
			`write(f, "# Antimicrobial usage survey open-ended questions\n\n")`

			`for (question, answers) in question_dict`
			`write(f, "## $(question): $(first(survey_data[!, question]))\n\n")`

			`for (answer, themes) in answers`
			`write(f, "$answer\n\n Themes:\n")`

			`for (theme, is_answered) in themes`
			`if ismissing(is_answered)`
			`write(f, " - $theme?\n")`
			`elseif is_answered`
			`write(f, " - $theme\n")`
			`end #if`

			`end #for (theme)`
			`write(f, "\n\n---\n\n")`
			`end #for (answer)`
			`write(f, "\n\n")`
			`end #for (question)`
			`write(f, "\n\n")`
			`end #do`

			run(`pandoc results.md -o results.docx`)
feat: Add logging to script 2024-09-04 20:21:32 +00:00
			`# Close log file`
			`close(log_io)`