#!/usr/bin/env julia

using CSV
using DataFrames
using Dates
using JSON3
using Logging
using PromptingTools

# Columns of the survey export that hold the open-ended (free-text) questions to analyze.
questions = [:Q8, :Q16, :Q29, :Q30]

"""
    load_survey(path)

Read the tab-separated survey export at `path` into a `DataFrame`.

The second data row is dropped (presumably the export's JSON-encoded metadata row -- confirm
against the export format). The first data row, which holds the text of the question that
was actually asked, is kept and is also stored as each column's `"description"` metadata.
"""
function load_survey(path)
    survey_data = DataFrame(CSV.File(path; delim='\t', normalizenames=true))
    deleteat!(survey_data, 2)

    # Set descriptions of each column based on the actual question asked
    for (i, col) in enumerate(eachcol(survey_data))
        colmetadata!(survey_data, i, "description", first(col))
    end #for

    # Remove the messy JSON encoding
    # TODO: For later graphs, move this step _before_ the import so that DataFrames can
    # properly infer types
    # deleteat!(survey_data, [1,2])

    return survey_data
end #function

"""
    logged_prompt(prompt)

Send `prompt` to the `llama3.1` model through PromptingTools' Ollama schema and return the
content of the reply. Both the prompt and the reply are written to the current logger.
"""
function logged_prompt(prompt)
    @info "Prompting Llama3.1 with \n```$prompt\n```\n"
    response =
        aigenerate(
            PromptingTools.OllamaSchema(),
            prompt;
            model="llama3.1",
            api_kwargs=(; options=(; num_gpu=99)),
        ).content
    @info "Llama3.1 responsed with \n```\n$response\n```\n"
    return response
end #function

"""
    extract_themes(response)

Return the theme titles found in an LLM summary.

A theme is any line of the form `<number>. **<title>**:`; the bolded `<title>` is captured.
Lines that do not match are ignored, so the result may be empty.
"""
function extract_themes(response)
    themes = String[]
    for line in eachline(IOBuffer(response))
        # `[0-9]+` (not `[1-9]+`) so that items numbered 10, 20, ... are kept, and a lazy
        # capture so that a second bold span later on the line is not swallowed.
        m = match(r"^[0-9]+\. \*\*(.+?)\*\*:", line)
        isnothing(m) || push!(themes, m[1])
    end #for
    return themes
end #function

"""
    parse_yes_no(response)

Interpret an LLM reply as a yes/no answer.

Returns `true` if the reply starts with "yes", `false` if it starts with "no" (both
case-insensitive, ignoring leading whitespace), and `missing` otherwise.
"""
function parse_yes_no(response)
    normalized = lstrip(lowercase(response))
    startswith(normalized, "yes") && return true
    startswith(normalized, "no") && return false
    return missing
end #function

"""
    analyze_question(survey_data, q)

Analyze the free-text answers in column `q` of `survey_data`.

First asks the LLM to summarize the common themes across all answers, then asks, for every
answer and every extracted theme, whether the answer deals with that theme.

Returns a dictionary mapping each answer to a `Dict{String,Union{Bool,Missing}}` of
`theme => verdict`. Identical answers share a single entry (the last verdicts win).
"""
function analyze_question(survey_data, q)
    # The first non-missing entry of the column is the question text itself, not an answer.
    responses = collect(skipmissing(survey_data[!, q]))

    # One response per paragraph; the question text is bolded to set it apart.
    answers_block = join(
        (i == 1 ? "**$r**\n" : "$r\n" for (i, r) in enumerate(responses)), '\n'
    )

    # Summarize the major themes among all answers
    analysis_prompt = """
    The following is a list of answers to a survey with one response per paragraph:

    # Antimicrobial usage survey open-ended question: $q

    $answers_block
    ---

    Summarize the common themes between the survey responses.
    """
    analysis_response = logged_prompt(analysis_prompt)

    # Llama3 tends to summarize each theme with a bolded statement.
    # We extract the bolded statements to compile themes
    themes = extract_themes(analysis_response)
    @info "Found themes $themes"

    answer_dict = Dict{Any,Dict{String,Union{Bool,Missing}}}()

    # Now go back through each answer and check if it is addressing the theme noted
    for answer in Iterators.drop(responses, 1) # first "answer" is the question
        theme_dict = Dict{String,Union{Bool,Missing}}()
        for theme in themes
            theme_prompt = """
            The following was answered as a free-response answer on a survey:

            $answer

            ---

            Does this answer deal with the theme of $theme? Answer yes or no.
            """
            theme_dict[theme] = parse_yes_no(logged_prompt(theme_prompt))
        end #for
        answer_dict[answer] = theme_dict
    end #for

    return answer_dict
end #function

"""
    write_markdown_report(path, survey_data, questions, question_dict)

Write a Markdown report of the analysis to `path`.

Questions are written in the order given by `questions`. Under each answer, themes judged
to apply are listed; themes with an unclear (`missing`) verdict are listed with a trailing
`?`; themes judged not to apply are omitted.
"""
function write_markdown_report(path, survey_data, questions, question_dict)
    open(path, "w") do f
        write(f, "# Antimicrobial usage survey open-ended questions\n\n")
        # Iterate `questions` rather than the Dict so that the report order is stable.
        for question in questions
            answers = question_dict[question]
            write(f, "## $(question): $(first(survey_data[!, question]))\n\n")
            for (answer, themes) in answers
                write(f, "$answer\n\n Themes:\n")
                for (theme, is_answered) in themes
                    if ismissing(is_answered)
                        write(f, " - $theme?\n")
                    elseif is_answered
                        write(f, " - $theme\n")
                    end #if
                end #for (theme)
                write(f, "\n\n---\n\n")
            end #for (answer)
            write(f, "\n\n")
        end #for (question)
        write(f, "\n\n")
    end #do
    return nothing
end #function

"""
    main(questions)

Run the whole pipeline: log to a timestamped file, load `data.tsv`, analyze every column in
`questions` with the LLM, and write `results.json`, `results.md` and (via pandoc)
`results.docx` in the current directory.

The log file is closed and the previous global logger restored even if a step throws.
"""
function main(questions)
    # Set logging to file
    log_io = open("antimicrobial-use-survey-analysis_$(now()).log", "w+")
    previous_logger = global_logger(SimpleLogger(log_io))
    try
        # Import data
        survey_data = load_survey("data.tsv")

        # Compile comments from all questions and analyze
        # We will be offloading the analysis to Ollama running Llama3.1 locally
        question_dict = Dict{Symbol,Any}()
        for q in questions
            question_dict[q] = analyze_question(survey_data, q)
        end #for

        open("results.json", "w") do io
            JSON3.write(io, question_dict)
        end #do

        write_markdown_report("results.md", survey_data, questions, question_dict)

        run(`pandoc results.md -o results.docx`)
    finally
        # Close log file
        global_logger(previous_logger)
        close(log_io)
    end #try
    return nothing
end #function

main(questions)