antimicrobial-use-survey-an.../main.jl

151 lines
4 KiB
Julia
Raw Normal View History

2024-09-03 22:50:16 +00:00
#!/usr/bin/env julia
using CSV
using DataFrames
2024-09-04 20:21:32 +00:00
using Dates
2024-09-04 21:49:17 +00:00
using JSON3
2024-09-04 20:21:32 +00:00
using Logging
2024-09-04 20:46:28 +00:00
using PromptingTools
2024-09-04 20:21:32 +00:00
# Set logging to file.
# The timestamp is formatted explicitly, without colons, because `string(now())`
# contains ':' characters, which are not legal in filenames on Windows.
log_timestamp = Dates.format(now(), "yyyy-mm-dd_HH-MM-SS")
log_io = open("antimicrobial-use-survey-analysis_$(log_timestamp).log", "w+")
logger = SimpleLogger(log_io)
global_logger(logger)
2024-09-03 22:50:16 +00:00
# Load the tab-separated survey export into a DataFrame
survey_data = DataFrame(CSV.File("data.tsv"; delim='\t', normalizenames=true))

# Drop the second data row.
# NOTE(review): presumably this is the row holding the messy JSON encoding -- confirm
# against the raw export.
deleteat!(survey_data, 2)

# The first row of every column holds the text of the question that was actually
# asked; attach it to the column as its "description" metadata
for (name, col) in pairs(eachcol(survey_data))
    colmetadata!(survey_data, name, "description", first(col))
end #for

# TODO: For later graphs, strip the non-answer rows (rows 1 and 2 of the export)
# _before_ the import so that DataFrames can properly infer types
# Compile comments from all questions and analyze
# We will be offloading the analysis to Ollama running Llama3.1 locally
2024-09-03 22:50:16 +00:00
# Columns holding the open-ended questions whose answers will be analyzed
questions = Symbol[:Q8, :Q16, :Q29, :Q30]
"""
    logged_prompt(prompt; model="llama3.1")

Send `prompt` to `model` through PromptingTools' Ollama schema and return the
`content` of the reply. Both the prompt and the reply are recorded with `@info`.

# Arguments
- `prompt`: the text to send to the model.

# Keywords
- `model`: name of the model to query. Defaults to `"llama3.1"`.

# Returns
The `content` field of the value returned by `aigenerate`.
"""
function logged_prompt(prompt; model="llama3.1")
    @info "Prompting $model with \n```\n$prompt\n```\n"
    response = aigenerate(
        PromptingTools.OllamaSchema(),
        prompt;
        model=model,
        # Forwarded as-is to the Ollama API `options`
        api_kwargs=(; options=(; num_gpu=99)),
    ).content
    @info "$model responded with \n```\n$response\n```\n"
    return response
end #function
"""
    extract_themes(response)

Return the theme titles found in an LLM summary.

The model tends to list each theme as a numbered item with a bolded title, i.e. a
line of the form `N. **Title**:`. Every such title is collected, in order of
appearance. Lines that do not match that shape are ignored.
"""
function extract_themes(response)
    themes = String[]
    for line in eachline(IOBuffer(response))
        # `[0-9]+` rather than `[1-9]+` so that items 10, 20, ... are not skipped.
        # The lazy `.+?` stops at the first `**:` should the line contain more bold text.
        m = match(r"^[0-9]+\. \*\*(.+?)\*\*:", line)
        isnothing(m) || push!(themes, m[1])
    end #for
    return themes
end #function

"""
    parse_yes_no(response)

Interpret a free-text reply to a yes/no question.

Return `true` if the reply starts with "yes", `false` if it starts with "no"
(both case-insensitive, ignoring surrounding whitespace), and `missing` otherwise.
"""
function parse_yes_no(response)
    normalized = lowercase(strip(response))
    startswith(normalized, "yes") && return true
    startswith(normalized, "no") && return false
    return missing
end #function

# question => (answer => (theme => does the answer address the theme?))
question_dict = Dict{Symbol,Dict{AbstractString,Dict{String,Union{Bool,Missing}}}}()

for q in questions
    # The first non-missing entry of the column is the question text itself;
    # everything after it is an answer
    responses = collect(skipmissing(survey_data[!, q]))

    # Summarize the major themes among all answers.
    # One response per paragraph, with the question text in bold at the top.
    answers_block = join(
        (i == 1 ? "**$a**\n" : "$a\n" for (i, a) in enumerate(responses)), '\n'
    )
    analysis_prompt = """
        The following is a list of answers to a survey with one response per paragraph:
        # Antimicrobial usage survey open-ended question: $q
        $answers_block
        ---
        Summarize the common themes between the survey responses.
        """
    analysis_response = logged_prompt(analysis_prompt)

    # Compile all themes that Llama3 identified from its bolded list headings
    themes = extract_themes(analysis_response)
    @info "Found themes $themes"
    isempty(themes) && @warn "No themes could be extracted for question $q"

    # Now go back through each answer and check if it is addressing the theme noted
    answer_dict = Dict{AbstractString,Dict{String,Union{Bool,Missing}}}()
    for a in Iterators.drop(responses, 1) # skip the question text
        # Identical answers produce identical prompts and share one dictionary key,
        # so there is no point in querying the model for them again
        haskey(answer_dict, a) && continue
        theme_dict = Dict{String,Union{Bool,Missing}}()
        for t in themes
            theme_prompt = """
                The following was answered as a free-response answer on a survey:
                $a
                ---
                Does this answer deal with the theme of $t? Answer yes or no.
                """
            theme_dict[t] = parse_yes_no(logged_prompt(theme_prompt))
        end #for
        answer_dict[a] = theme_dict
    end #for
    question_dict[q] = answer_dict
end #for
# Dump the complete question => answer => theme mapping as JSON
open(out -> JSON3.write(out, question_dict), "results.json", "w")
# Write a human-readable Markdown report: one section per question and, beneath each
# answer, the themes it was judged to address. Themes whose classification could not
# be determined (`missing`) are listed with a trailing question mark.
open("results.md", "w") do f
    write(f, "# Antimicrobial usage survey open-ended questions\n\n")
    # Walk `questions` rather than the Dict so that sections appear in a fixed,
    # declared order instead of hash order
    for question in questions
        haskey(question_dict, question) || continue
        answers = question_dict[question]
        write(f, "## $(question): $(first(survey_data[!, question]))\n\n")
        # Likewise list the answers in survey order. The first non-missing entry of
        # the column is the question text, not an answer, so it is skipped.
        ordered_answers = unique(
            Iterators.drop(skipmissing(survey_data[!, question]), 1)
        )
        for answer in ordered_answers
            haskey(answers, answer) || continue
            themes = answers[answer]
            write(f, "$answer\n\n Themes:\n")
            for (theme, is_answered) in themes
                if ismissing(is_answered)
                    write(f, " - $theme?\n")
                elseif is_answered
                    write(f, " - $theme\n")
                end #if
            end #for (theme)
            write(f, "\n\n---\n\n")
        end #for (answer)
        write(f, "\n\n")
    end #for (question)
    write(f, "\n\n")
end #do
# Convert the Markdown report to a Word document with pandoc
pandoc_cmd = `pandoc results.md -o results.docx`
run(pandoc_cmd)
2024-09-04 20:21:32 +00:00
# Close log file.
# Point the global logger back at the console first, so that anything logged after
# this point (e.g. when the script is `include`d from a REPL session) does not try
# to write to the closed stream.
global_logger(ConsoleLogger(stderr))
isopen(log_io) && close(log_io)