Introduction

In this analysis I load three example .csv file inputs titled “Person”, “Encounter”, and “Attribution” into a SQL database, simulating care and intake across 17 health centers. This is a work in progress and I plan to add more graphs in the future, including an updated interactive network analysis graph with physics simulation using the visNetwork() package. R can be used to create JavaScript objects that can be served to Business Intelligence platforms and internal data dashboards.


Building a SQL Database

To build the database with the example .csv files I created a SQLite database with the DBI() package and RStudio. I then used RMarkdown’s inline SQL engine and the kable() package to implement and display SQLite database queries in neatly formatted PDF LaTex tables. I then saved the file outputs as CSV files.


Load required R packages

library(readr) # Read file inputs
library(DBI) # Database connection
library(kableExtra) # Table formatting
library(ggplot2) # Graphing
library(plotly) # Make interactive graphs
library(magrittr) # Pipes %>%
library(dplyr) # Data manipulation
library(ggraph) # More graphing
library(igraph) # Even more graphing

Read both .csv tables into R with R tidyverse functions.

Attribution <- read_delim("data/AttributionData.csv", delim = "|")
Encounter <- read_delim("data/EncounterData.csv", delim = "|")
Person <- read_delim("data/PersonData.csv", delim = "|", skip_empty_rows = TRUE)
Person <- Person[-(nrow(Person)),]

Create a new SQLite database file path

network_db_file <- "data/network_db.sqlite"

Make a SQLite database at the file path location ‘conversations_db_file’

con <- dbConnect(RSQLite::SQLite(), network_db_file)

Write the ‘Attribution’ table to the database

dbWriteTable(con, "Attribution", Attribution[,], overwrite = TRUE, row.names = FALSE)
Attribution
person_id client_DOB Medicaid_Begin Medicaid_End AttributionDate PrevAttrib orig_site_id
10EDA3E5-E0AD-4BF3-A8C3-5477A9184B7F Sep 5 1966 12:00AM 20170201 22991231 20190301 No - New to PCMP 15
46D8B92E-870E-49BF-9063-C5609A72702A Mar 15 1988 12:00AM 20150801 22991231 20190301 Yes - Continuous 15
24C40ACB-C764-4EB1-B5AB-EC154906B1CB May 13 1985 12:00AM NULL NULL 20190301 Yes - Continuous 15
60A2FD9B-05FD-43ED-B1BC-2C910455C254 Mar 19 1957 12:00AM 20151101 22991231 20190301 Yes - Continuous 13
F3C34F73-7975-4335-B73D-1F3ACF5FE788 May 4 1962 12:00AM 20160301 22991231 20190301 Yes - Continuous 15

Write the ‘Encounter’ table to the database

dbWriteTable(con, "Encounter", Encounter[,], overwrite = TRUE, row.names = FALSE)
Encounter
enc_number enc_timestamp enc_billable_ind enc_orig_site_id enc_qualifying_ind enc_service_line_unscrubbed enc_pat_drvs_number client_service_line_id service_line_mapped
3544519 2015-10-20 19:12:32 N 2 Y Case Management 168775 325 Behavioral Health
3544709 2015-10-20 21:49:43 N 1 N MD_Family Practitioner 29116 63 Primary Care
3545122 1900-10-19 22:56:11 N 1 N LPC_Mental Health Practitioner 51591 62 Behavioral Health
3545157 2015-10-20 20:54:41 N 2 N Other 135474 330 Specialty
3545517 2015-10-19 10:31:33 N 1 N UDS_Education Specialists 59088 73 Behavioral Health
3545559 2015-10-18 21:47:00 N 1 N _ 31203 38 Ignore

Write the ‘Person’ table to the database

dbWriteTable(con, "Person", Person[,], overwrite = TRUE, row.names = FALSE)
Person
person_id pat_drvs_number pat_date_of_birth pat_homelessness_status pat_migrant_status pat_veteran_status pat_create_timestamp pat_modify_timestamp pat_orig_site_id
637AA380-3E1E-4F10-A4E8-EE96F6979C95 90 1990-02-05 Not Homeless Not A Farm Worker N 2010-04-26 13:58:43 2015-05-28 11:50:06 1
8F63BF05-E0A1-4617-BE5E-6D0D0EF9795C 1609 1990-03-24 Not Homeless Not A Farm Worker N 2011-12-19 10:19:12 2015-11-30 12:12:46 1
4362171F-92B6-4D6B-87F7-880FD2DA67DA 4682 1972-07-05 Not Homeless Not A Farm Worker N 2011-01-28 11:06:40 2015-06-25 13:52:37 1
661B4E44-3B6E-49F1-9A49-7A217ADEC658 8451 1992-03-18 Not Homeless Not A Farm Worker N 2012-09-11 08:14:40 2019-04-16 10:13:20 1
A8A108BB-6731-4B81-93AF-1F4F7C3D9923 10383 1991-05-01 NULL NULL N 2010-04-26 13:58:43 2010-04-26 12:01:40 1
9C234B3E-3B87-4C8A-AF26-1C9E265D4E25 15559 1990-07-03 Not Homeless Not A Farm Worker N 2011-07-26 09:27:46 2019-04-16 10:24:39 1

Performing Data Analysis

1. Identify the most recent encounter for each patient, by health center and specialty.

WITH enc_ranked AS(
  SELECT 
    p.person_id, 
    p.pat_drvs_number,
    e.enc_pat_drvs_number,
    e.enc_number,
    e.enc_service_line_unscrubbed,
    e.service_line_mapped,
    e.enc_orig_site_id, 
    datetime(e.enc_timestamp, 'unixepoch') as enc_timestamp,
    ROW_NUMBER() OVER(PARTITION BY p.person_id ORDER BY enc_timestamp DESC) AS rank
  FROM
    Encounter e
  INNER JOIN Person p
    on e.enc_pat_drvs_number = p.pat_drvs_number)
SELECT 
  person_id, 
  enc_number, 
  enc_timestamp, 
  enc_orig_site_id, 
  enc_service_line_unscrubbed, 
  service_line_mapped
  FROM 
    enc_ranked 
  WHERE rank = 1

Table 1:

The first 5 rows of Table 1 are shown
person_id enc_number enc_timestamp enc_orig_site_id enc_service_line_unscrubbed service_line_mapped
00225091-B854-4B6D-8520-308C20E0FE2E 32354540 2019-07-23 13:40:00 11 Family Medicine Primary Care
006589B1-A5F7-493D-A86A-C5F5A6FC3767 31211784 2019-05-13 13:20:00 11 Family Medicine Primary Care
008394B7-47A5-4206-A641-D7116DB12B3B 32579131 2019-08-05 09:30:00 11 Dental General Practice Dental
00BA341D-528C-465F-8C59-113DE481CBD1 28327946 2018-12-07 10:20:00 11 Family Medicine Primary Care
015185D9-24F6-415C-860F-A7E5DA5DBC23 32528909 2019-08-01 09:20:00 11 Family Medicine Primary Care

1a. Plot the results in Table 1 with an interactive stacked bar chart

# Summarize data by health center and specialty
q1_summary <- q1_output %>%
  group_by(enc_orig_site_id, service_line_mapped) %>%
  summarize(n = n()) %>%
  ungroup()

# Create stacked bar chart
p <- ggplot(q1_summary, aes(x = enc_orig_site_id, y = n, fill = service_line_mapped)) +
  geom_col(position = "stack") +
  scale_fill_discrete() +
  scale_x_discrete(labels = seq_along(unique(q1_summary$enc_orig_site_id))) +
  labs(x = "Health Center", y = "Count", fill = "Specialty") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

ggplotly(p)

2. Identify patients who have been seen at more than one health center

WITH enc_site AS(
  SELECT 
    p.person_id, 
    p.pat_drvs_number,
    e.enc_pat_drvs_number,
    e.enc_orig_site_id
  FROM
    Encounter e
  INNER JOIN Person p
    ON e.enc_pat_drvs_number = p.pat_drvs_number)
SELECT person_id, COUNT(DISTINCT(enc_orig_site_id)) AS centers_visited
  FROM 
    enc_site
  GROUP BY person_id
  HAVING centers_visited > 1

Table 2:

The first 6 rows of Table 2 are shown
person_id centers_visited
00225091-B854-4B6D-8520-308C20E0FE2E 2
006589B1-A5F7-493D-A86A-C5F5A6FC3767 3
015185D9-24F6-415C-860F-A7E5DA5DBC23 3
01A964D5-CF94-4C57-B13C-17912E50CBBC 2
01D8C426-1C75-449D-A9BB-291BB67BD897 2
021F6E0B-D736-44CB-BFD9-7CDDEC3ECC25 2

3. Use a network analysis graph to determine which groups of health centers have the most patient visits.

# Query data from database to find edges in network
edges <- dbGetQuery(con, "WITH enc_site AS(
  SELECT 
    e.enc_pat_drvs_number,
    e.enc_orig_site_id
  FROM Encounter e)
SELECT e1.enc_orig_site_id AS site1, e2.enc_orig_site_id AS site2, COUNT(DISTINCT e1.enc_pat_drvs_number) AS weight
  FROM enc_site e1
  JOIN enc_site e2 ON e1.enc_pat_drvs_number = e2.enc_pat_drvs_number AND e1.enc_orig_site_id < e2.enc_orig_site_id
  GROUP BY e1.enc_orig_site_id, e2.enc_orig_site_id")

# Create igraph object
graph <- graph_from_data_frame(edges, directed = FALSE)

# Add node attribute for health center names
health_centers <- dbGetQuery(con, "SELECT enc_orig_site_id as name FROM Encounter")
V(graph)$name <- health_centers[match(V(graph)$name, health_centers$name), "name"]

# Convert igraph object to ggplot object using ggraph
g <- ggraph(graph, layout = "kk") +
  geom_edge_link(aes(width = weight)) +
  geom_node_point(linewidth = 5, color = "blue") +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()

# Plot the graph
g


4. Determine if patients are currently attributed to the health center they have most recently visited.

WITH enc_ranked AS(
  SELECT 
    p.person_id, 
    p.pat_drvs_number,
    e.enc_pat_drvs_number,
    e.enc_number,
    e.enc_orig_site_id, 
    datetime(e.enc_timestamp, 'unixepoch') as enc_timestamp,
    ROW_NUMBER() OVER(PARTITION BY p.person_id ORDER BY enc_timestamp DESC) AS rank
  FROM
    Encounter e
  INNER JOIN Person p
    ON e.enc_pat_drvs_number = p.pat_drvs_number
), 
recent_visit AS(
  SELECT person_id, 
    enc_number, 
    enc_orig_site_id 
    FROM 
      enc_ranked
    WHERE rank = 1)
SELECT 
  v.person_id, 
  v.enc_number, 
  v.enc_orig_site_id, 
  a.orig_site_id,
  CASE WHEN v.enc_orig_site_id = a.orig_site_id 
    THEN 'YES' 
    ELSE 'NO'
  END AS enc_atrb_orig_site
  FROM
    recent_visit v
  LEFT JOIN Attribution a
    ON v.person_id = a.person_id
  GROUP BY v.person_id

Table 4:

The first 4 rows of Table 4 are shown
person_id enc_number enc_orig_site_id orig_site_id enc_atrb_orig_site
00225091-B854-4B6D-8520-308C20E0FE2E 32354540 11 7 NO
006589B1-A5F7-493D-A86A-C5F5A6FC3767 31211784 11 2 NO
008394B7-47A5-4206-A641-D7116DB12B3B 32579131 11 11 YES
00BA341D-528C-465F-8C59-113DE481CBD1 28327946 11 11 YES

5. Identify the number of unique patients seen by each specialty.

SELECT
  e.enc_service_line_unscrubbed as specialty,
  COUNT(DISTINCT e.enc_pat_drvs_number) as unique_patients
FROM
  Encounter e
WHERE 
  e.enc_service_line_unscrubbed <> 'NULL' AND
  e.enc_service_line_unscrubbed <> 'Other'
GROUP BY enc_service_line_unscrubbed
ORDER BY unique_patients DESC

5a. Plot unique patients by specialty with a Pie Chart.

# Generate pie chart
pie_data <- q4_output[1:15,]

# Create plotly interactive pie chart
plot_ly(data = pie_data, labels = ~specialty, values = ~unique_patients, type = "pie",
        hoverinfo = "label+value", textinfo = "percent", textposition = "inside",
        insidetextfont = list(size = 14), marker = list(colors = rainbow(length(unique(pie_data$specialty))),
                                                         line = list(color = "#FFFFFF", width = 2))) %>%
  layout(title = "Unique Patients by Specialty",
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         showlegend = TRUE,
         legend = list(orientation = "v", x = 1.1, y = 0.5))