OSF Registration Badges

Published

September 15, 2025

Modified

October 22, 2025

Overview

This method measures lifecycle open science on OSF by looking at open science practices linked to open science registrations.

Definitions

OSR: An OSF Registration that is jointly open, non-deprecated and authentic

Openness Criteria:
1. Public (open-1)
2. ~~Not Embargoed (open-2)~~
Non-Deprecation Criteria:
1. Registered (nondeprecated-1)
2. Not Deleted (nondeprecated-2)
3. Not Retracted (nondeprecated-3)
Authenticity Criteria:
1. Not Spam (authentic-1)

LOS-Reg: An OSR that represents a lifecycle open science research project.

Lifecycle Openness Criteria:
1. Output - The OSR has at least 1 linked Open Practice Resource from among: “Data”, “Analytic Code”, “Materials” or “Supplements”.
2. Outcome - The OSR has at least 1 linked “Papers” Open Practice Resource.

Results

See the code

# Packages
library(arrow)
library(ggplot2)
library(ggiraph)
library(glue)
library(gt)
library(dplyr)
library(lubridate)
library(plotly)
library(scales)
library(tidyr)
library(timetk)


# Modules
box::use(
  R / helpers[tidy_registry_names, tidy_template_names],
  R / plot[pivoter, factorizer, ts_prep],
  R / parameters[DATES, OUTPUTS, OUTCOMES]
)

# Title-case the imported resource-type labels; later chunks lower-case them
# again with tolower() where they are matched against event names.
OUTPUTS <- OUTPUTS |> stringr::str_to_title()
OUTCOMES <- OUTCOMES |> stringr::str_to_title()

# Local functions
#' Format a registration summary as a gt table
#'
#' Adds a header (with an "as of" subtitle built from the script-level
#' `MOST_RECENT_CHR`), formats the count columns `OSR` through `LOS-Reg` as
#' whole numbers and `LOS-Reg / OSR` as a percentage, and attaches lettered
#' footnotes to the `OSR` and `LOS-Reg` column labels.
#'
#' @param tbl Data frame containing the columns `OSR` through `LOS-Reg`
#'   (selected as a column range) and `LOS-Reg / OSR`.
#' @param title Title shown in the table header.
#' @param interactive If `TRUE`, the table is passed through
#'   `opt_interactive()`; otherwise the static table is returned.
#' @param ... Forwarded to `opt_interactive()` when `interactive` is `TRUE`.
#' @return A gt table.
table_helper <- function(tbl, title = "Lifecycle Open Science Registrations", interactive = TRUE, ...) {
    # Footnote text for the two abbreviated column labels
    osr_note <- md("*OSR: Open Science Registration*")
    los_note <- md("*LOS-Reg: Lifecycle Open Science Registration*")

    # MOST_RECENT_CHR is defined later in the script and looked up at call time
    static_tbl <- gt(tbl) |>
        tab_header(
            title = title,
            subtitle = paste0("as of ", MOST_RECENT_CHR)
        ) |>
        fmt_number(columns = c(OSR:`LOS-Reg`), decimals = 0) |>
        fmt_percent(columns = `LOS-Reg / OSR`, decimals = 2) |>
        tab_footnote(
            footnote = osr_note,
            locations = cells_column_labels(columns = OSR)
        ) |>
        tab_footnote(
            footnote = los_note,
            locations = cells_column_labels(columns = `LOS-Reg`)
        ) |>
        opt_footnote_marks("letters") |>
        opt_row_striping(row_striping = TRUE)

    # Static output requested: skip the interactive wrapper entirely
    if (!interactive) {
        return(static_tbl)
    }

    opt_horizontal_padding(opt_interactive(static_tbl, ...), 0)
}


#' Format a time-to-event summary as a gt table
#'
#' Keeps the "output", "outcome" and "lifecycle" events (matched
#' case-insensitively), orders them Output, Outcome, Lifecycle, and renders
#' the prevalence and distribution columns under two spanners.
#'
#' @param tbl Data frame with the columns `event`, `n`, `p`, `mean`, `p50`,
#'   `sd`, `min` and `max`, plus any grouping columns named in `...`.
#' @param title Header title. A title supplied by the caller is used
#'   verbatim (it was previously ignored).
#' @param group_label Optional label for the grouping variable. When `title`
#'   is left at its default, " by <Group Label>" (title-cased) is appended.
#' @param ... Grouping columns to keep; they are placed before `event`.
#' @return A gt table.
tte_table <- function(tbl, title = "Time to Lifecycle Open Status (in days)", group_label = NULL, ...) {
  # Set title: honour an explicit title as given; only the default title
  # receives the " by <group>" suffix, so callers that already name the group
  # in their own title do not get it twice.
  TITLE <- title
  if (missing(title) && !is.null(group_label)) {
    TITLE <- paste0(TITLE, " by ", stringr::str_to_title(group_label))
  }

  # Subset data
  tbl <- tbl |>
    select(..., event, n, p, mean, p50, sd, min, max) |>
    filter(tolower(event) %in% c("output", "outcome", "lifecycle")) |>
    mutate(
      event = stringr::str_to_title(event),
      # Temporary factor so arrange() sorts in milestone order, not alphabetically
      event = factor(event, levels = c("Output", "Outcome", "Lifecycle"))
    ) |>
    arrange(event) |>
    mutate(event = as.character(event))

  # Format table
  gtbl <- tbl |>
    gt(row_group_as_column = TRUE) |>
    tab_header(
      title = TITLE,
      subtitle = "Among all Open Science Registrations (OSR)") |>
    cols_label(
      event = "Event",
      n = "Count",
      p = "Percent",
      mean = "Mean",
      p50 = "Median",
      sd = "Std. Dev.",
      min = "Min",
      max = "Max"
    ) |>
    #Spanners
    tab_spanner(columns = c(n, p), label = "Prevalance") |>
    tab_spanner(columns = c(mean, p50, sd, min, max), label = "Distribution") |>
    # Value formatting
    fmt_number(columns = c(n, mean, p50, sd, min, max), decimals = 0) |>
    fmt_percent(columns = p, decimals = 1)
  # Return table
  gtbl
}



# Data sources
# First month kept in every time series below (previously repeated inline)
START_DATE <- "2022-08-01"

# Aggregate monthly series
all_ts <- read_parquet(here::here("data/registration_tsmonthly.parquet")) |>
    filter(date >= START_DATE)

# Monthly series per registry, with display-friendly registry names
registry_ts <- read_parquet(here::here("data/registration_registries_tsmonthly.parquet")) |>
    filter(date >= START_DATE) |>
    mutate(registry = tidy_registry_names(registry))

# Monthly series per template. After tidying the names, rows sharing a
# date/template pair are summed (presumably because several raw template
# names map to the same tidy name -- verify against tidy_template_names()).
template_ts <- read_parquet(here::here("data/registration_templates_tsmonthly.parquet")) |>
    filter(date >= START_DATE) |>
    mutate(template = tidy_template_names(template)) |>
    group_by(date, template) |>
    # across() replaces the superseded summarise_all(); "drop_last" keeps the
    # result grouped by date, exactly as summarise_all() left it
    summarise(across(everything(), sum), .groups = "drop_last")

# Pivot each series for plotting and assign labels; the registry and template
# series pass their grouping column to pivoter()
all_summary <- ts_prep(pivoter(all_ts))
registry_summary <- ts_prep(pivoter(registry_ts, registry))
template_summary <- ts_prep(pivoter(template_ts, template))

# Constants
# Latest date in the aggregate series: used as-is in table subtitles, and
# parsed with ymd() for filtering the "today" tables
MOST_RECENT_CHR <- max(all_ts[["date"]])
MOST_RECENT <- ymd(MOST_RECENT_CHR)

# Criteria codes shown in the summary tables and their display names,
# matched by position
TABLE_CRITERIA <- c("los_plan", "los_outputs", "los_outcomes", "los_complete")
TABLE_NAMES <- c("OSR", "OSR + Output(s)", "OSR + Outcome(s)", "LOS-Reg")

Today

Here is the current state of LOS research projects on the OSF as of 2025-10-01. For all of the summary tables below, the following fields are included:

<GROUP_NAME>: An optional grouping variable for seeing disaggregated results (e.g., by registry, registration template, etc.)
OSR: Number of Open Science Registrations (i.e., Open + Non-deprecated + Authentic)
OSR + Output(s): Number of Open Science Registrations with at least one linked output resource
OSR + Outcome(s): Number of Open Science Registrations with at least one linked outcome resource
LOS-Reg: Lifecycle Open Science Registrations - number of Open Science Registrations with at least one linked outcome and at least one linked output resource
LOS-Reg / OSR: Percentage of Open Science OSF Registrations (OSR) that are Lifecycle Open Science Registrations (LOS-Reg)

Aggregate

See the code

# Display name -> criteria code. Columns are renamed by name rather than by
# position so a different column order out of pivot_wider() cannot silently
# attach the wrong label to a count.
criteria_labels <- setNames(TABLE_CRITERIA, TABLE_NAMES)

overall_tbl <- all_summary |>
    filter(criteria %in% TABLE_CRITERIA) |>
    pivot_wider(names_from = criteria, values_from = n) |>
    # The first column is the single id column (the date), as in the
    # positional rename this replaces
    rename(Date = 1, all_of(criteria_labels)) |>
    # Fix the column order that table_helper() relies on (OSR ... LOS-Reg)
    relocate(Date, all_of(TABLE_NAMES)) |>
    mutate(
        # Share of OSRs that are lifecycle open
        `LOS-Reg / OSR` = `LOS-Reg` / `OSR`
    )

# Most recent month only, rendered without pagination or sorting
today_gtbl <- overall_tbl |>
    filter(Date == MOST_RECENT) |>
    select(-Date) |>
    table_helper("Lifecycle Open Science Registrations", use_pagination = FALSE, use_sorting = FALSE)

today_gtbl

Lifecycle Open Science Registrations

as of 2025-10-01

^a OSR: Open Science Registration

^b LOS-Reg: Lifecycle Open Science Registration

By Registry

See the code

# Display name -> criteria code. Columns are renamed by name rather than by
# position so a different column order out of pivot_wider() cannot silently
# attach the wrong label to a count.
criteria_labels <- setNames(TABLE_CRITERIA, TABLE_NAMES)

registry_tbl <- registry_summary |>
    filter(criteria %in% TABLE_CRITERIA) |>
    pivot_wider(id_cols = c(date, registry), names_from = criteria, values_from = n) |>
    rename(Date = date, Registry = registry, all_of(criteria_labels)) |>
    # Fix the column order that table_helper() relies on (OSR ... LOS-Reg)
    relocate(Date, Registry, all_of(TABLE_NAMES)) |>
    mutate(
        # Share of each registry's OSRs that are lifecycle open
        `LOS-Reg / OSR` = `LOS-Reg` / `OSR`
    )

# Most recent month only, registries with the most LOS-Reg first
today_registry_gtbl <- registry_tbl |>
    ungroup() |>
    filter(Date == MOST_RECENT) |>
    arrange(desc(`LOS-Reg`)) |>
    select(-Date) |>
    table_helper("Lifecycle Open Science Registrations by Registry", use_pagination = FALSE)

today_registry_gtbl

Lifecycle Open Science Registrations by Registry

as of 2025-10-01

^a OSR: Open Science Registration

^b LOS-Reg: Lifecycle Open Science Registration

By Template

See the code

# Display name -> criteria code. Columns are renamed by name rather than by
# position so a different column order out of pivot_wider() cannot silently
# attach the wrong label to a count.
criteria_labels <- setNames(TABLE_CRITERIA, TABLE_NAMES)

template_tbl <- template_summary |>
    filter(criteria %in% TABLE_CRITERIA) |>
    pivot_wider(id_cols = c(date, template), names_from = criteria, values_from = n) |>
    rename(Date = date, Template = template, all_of(criteria_labels)) |>
    # Fix the column order that table_helper() relies on (OSR ... LOS-Reg)
    relocate(Date, Template, all_of(TABLE_NAMES)) |>
    mutate(
        # Share of each template's OSRs that are lifecycle open
        `LOS-Reg / OSR` = `LOS-Reg` / `OSR`
    )

# Most recent month only, templates with the most LOS-Reg first
today_template_gtbl <- template_tbl |>
    ungroup() |>
    filter(Date == MOST_RECENT) |>
    arrange(desc(`LOS-Reg`)) |>
    select(-Date) |>
    table_helper("Lifecycle Open Science Registrations by Template", use_search = TRUE)

today_template_gtbl

Lifecycle Open Science Registrations by Template

as of 2025-10-01

^a OSR: Open Science Registration

^b LOS-Reg: Lifecycle Open Science Registration

Time to Lifecycle Open Status

See the code

# TODO: Compute date of OSR status from logged event tables, not just current status
# Registrations currently flagged as OSR, reduced to the columns used below
reg_tbl <- here::here("data/registration_current.parquet") |>
    read_parquet() |>
    filter(is_osr == 1) |>
    select(node_id, registry, registered_date)

# Badge timing rows for those OSRs only; the join errors if a node_id matches
# more than one row of reg_tbl
reg_events <- here::here("data/registration_badges_time.parquet") |>
    read_parquet() |>
    inner_join(reg_tbl, by = "node_id", relationship = "many-to-one")


#' Time to event summarizer
#'
#' Summarises `t` for every combination of `x` and the grouping columns in
#' `...`: the number of non-missing values, their share of the distinct
#' `node_id`s in the group, and the mean, median, standard deviation, minimum
#' and maximum.
#'
#' @param tbl Data frame with a `node_id` column plus the columns named by
#'   `t`, `x` and `...`.
#' @param t Unquoted column holding the value to summarise.
#' @param x Unquoted column identifying the event.
#' @param ... Optional unquoted grouping columns.
#' @return A data frame with the grouping columns, `x`, and `n`, `p`, `mean`,
#'   `p50`, `sd`, `min`, `max`. `min` and `max` are `NA` (not Inf/-Inf) for
#'   groups in which every value of `t` is missing.
tte_summarizer <- function(tbl, t = time_to_event, x = event, ...) {
    # min()/max() warn and return Inf/-Inf when nothing is left after dropping
    # NAs; return a missing value of the same type instead
    na_if_all_missing <- function(f) {
        function(v) {
            if (all(is.na(v))) v[NA_integer_] else f(v, na.rm = TRUE)
        }
    }
    min_or_na <- na_if_all_missing(min)
    max_or_na <- na_if_all_missing(max)

    # Totals: distinct node_ids per group, repeated for every event so each
    # summary row finds its denominator
    df_n <- tbl |>
        summarize(
            .by = c(...),
            N = n_distinct(node_id)
        ) |>
        cross_join(distinct(tbl, {{ x }}))

    # Join on every column of df_n except the denominator itself; these are
    # the same keys the implicit natural join used, now stated explicitly
    join_keys <- setdiff(names(df_n), "N")

    # Summary
    tbl |>
        summarise(
            .by = c({{ x }}, ...),
            n = sum(!is.na({{ t }})),
            mean = mean({{ t }}, na.rm = TRUE),
            p50 = median({{ t }}, na.rm = TRUE),
            sd = sd({{ t }}, na.rm = TRUE),
            min = min_or_na({{ t }}),
            max = max_or_na({{ t }})
        ) |>
        left_join(df_n, by = join_keys) |>
        mutate(
            p = n / N
        ) |>
        select(..., {{ x }}, n, p, mean, p50, sd, min, max)
}

Unless otherwise stated, the starting sample for these analyses is all Open Science Registrations (OSR). As of 2025-10-01, there are 1,988 Open Science Registrations (OSR) in the OSF.

See the code

# Long form (among earliest connected resources of each type): one row per
# node_id and event, with the event name taken from the column name minus
# its "date1_" prefix
df_long <- reg_events |>
    select(node_id, registered_date, date_lifecycle, starts_with("date1_")) |>
    # Give the lifecycle date the shared prefix so it pivots with the rest
    rename(date1_lifecycle = date_lifecycle) |>
    pivot_longer(
        cols = dplyr::matches("date1_"),
        names_to = "event",
        names_prefix = "date1_",
        values_to = "date"
    ) |>
    arrange(node_id, date)

#  Add sequencing: rank each registration's dated resource events by date.
#  Events sharing a date keep their incoming row order.
df_sequence <- df_long |>
    filter(event %in% tolower(c(OUTCOMES, OUTPUTS)), !is.na(date)) |>
    arrange(date) |>
    mutate(sequence = row_number(), .by = node_id) |>
    select(node_id, event, sequence)

# Time-to-Event: days from registration to each event. Events without a
# sequence (undated, or not a resource event) get sequence = NA.
df_tte <- df_long |>
    left_join(df_sequence, by = c("node_id", "event")) |>
    mutate(time_to_event = as.numeric(date - registered_date, units = "days"))

The following table summarizes how long it takes for an OSR to become fully lifecycle open. The rows correspond to three milestones in the research project lifecycle:

The first linked output resource
The first linked outcome resource
The achievement of lifecycle open status

The “Prevalance” columns report the number and percentage of OSRs that have reached each milestone, while the “Distribution” columns provide summary statistics characterizing the center, spread, and range of time to each milestone (in days).

See the code

# Summarise time to event per event across all OSRs, then render the
# milestone table with its default title
df_summary <- df_tte |>
    tte_summarizer(t = time_to_event, x = event)

df_summary |>
    tte_table()

Time to Lifecycle Open Status (in days)
Among all Open Science Registrations (OSR)
Event	Prevalance		Distribution
Event	Count	Percent	Mean	Median	Std. Dev.	Min	Max
Output	1,452	73.0%	449	355	442	0	3,270
Outcome	892	44.9%	648	564	491	−109	3,270
Lifecycle	356	17.9%	662	561	468	0	3,270

The following interactive graphic provides the same information via a boxplot.

See the code

# Milestone events only, with factor levels fixed to Output, Outcome, Lifecycle
milestone_tte <- df_tte |>
    filter(event %in% c("output", "outcome", "lifecycle")) |>
    mutate(
        event = factor(
            stringr::str_to_title(event),
            levels = c("Output", "Outcome", "Lifecycle")
        )
    )

# One interactive box per event. The box tooltip is built from the computed
# boxplot statistics; the outlier tooltip shows the rounded number of days.
milestone_boxes <- geom_boxplot_interactive(
    aes(
        x = event,
        y = time_to_event,
        fill = event,
        data_id = event,
        group = event,
        tooltip = after_stat({
            paste0(
                "Event: ", stringr::str_to_title(.data$fill),
                "\n",
                "\nMax: ", comma(.data$max),
                "\nQ3: ", comma(.data$upper),
                "\nMedian: ", comma(round(.data$middle)),
                "\nQ1: ", comma(.data$lower),
                "\nMin: ", comma(round(.data$min))
            )
        }),
        outlier.tooltip = paste("Days: ", comma(round(time_to_event)))
    )
)

p <- ggplot(milestone_tte) +
    milestone_boxes +
    # Fill repeats the x axis, so the legend is dropped
    guides(fill = "none") +
    scale_y_continuous(labels = label_comma()) +
    labs(
        y = "Days",
        x = "Event",
        title = "Time to Lifecycle Open Status (in days)"
    ) +
    theme_minimal()

g <- girafe(ggobj = p)
g

We can disaggregate our findings further by looking at the prevalence and distribution of time for individual resources (code, data, materials, supplements, and papers). As before, we present results via a table and an interactive boxplot.

See the code

# Rows for the five individual resource types, with title-cased event names
resource_summary <- df_summary |>
    mutate(event = stringr::str_to_title(event)) |>
    filter(event %in% c("Code", "Data", "Materials", "Supplements", "Papers"))

resource_summary |>
    gt() |>
    # Title
    tab_header(
        title = "Time to First Linked Resource (in days)",
        subtitle = "Among all Open Science Registrations (OSR)"
    ) |>
    # Column labels
    cols_label(
        event = "Event",
        n = "Count",
        p = "Percent",
        mean = "Mean",
        p50 = "Median",
        sd = "Std. Dev.",
        min = "Min",
        max = "Max"
    ) |>
    # Spanners
    tab_spanner(label = "Prevalance", columns = c(n, p)) |>
    tab_spanner(label = "Distribution", columns = c(mean, p50, sd, min, max)) |>
    # Value formatting: whole numbers for counts and days, one decimal for percent
    fmt_number(columns = c(n, mean, p50, sd, min, max), decimals = 0) |>
    fmt_percent(columns = p, decimals = 1) |>
    # Interactive table without search box or pagination
    opt_interactive(use_search = FALSE, use_pagination = FALSE) |>
    opt_horizontal_padding(0)

Time to First Linked Resource (in days)

Among all Open Science Registrations (OSR)

See the code

# Individual resource events only, with title-cased names for the axis
resource_tte <- df_tte |>
    filter(event %in% c("code", "data", "materials", "supplements", "papers")) |>
    mutate(event = stringr::str_to_title(event))

# One interactive box per resource type. The box tooltip is built from the
# computed boxplot statistics; the outlier tooltip shows the rounded days.
resource_boxes <- geom_boxplot_interactive(
    aes(
        x = event,
        y = time_to_event,
        fill = event,
        data_id = event,
        group = event,
        tooltip = after_stat({
            paste0(
                "Event: ", stringr::str_to_title(.data$fill),
                "\n",
                "\nMax: ", comma(.data$max),
                "\nQ3: ", comma(.data$upper),
                "\nMedian: ", comma(round(.data$middle)),
                "\nQ1: ", comma(.data$lower),
                "\nMin: ", comma(round(.data$min))
            )
        }),
        outlier.tooltip = paste("Days: ", comma(round(time_to_event)))
    )
)

p <- ggplot(resource_tte) +
    resource_boxes +
    # Fill repeats the x axis, so the legend is dropped
    guides(fill = "none") +
    scale_y_continuous(labels = label_comma()) +
    labs(
        y = "Days",
        x = "Event",
        title = "Time to First Linked Resource (in days)"
    ) +
    theme_minimal()

g <- girafe(ggobj = p)
g

The following table looks at differences in time to lifecycle open science moments (first output, first outcome, and lifecycle open) by registry. To ease interpretation, rows with zero counts are omitted. Results can be sorted along any column and the search box can be used to filter results.

See the code

# node_id -> registry lookup for the OSRs
registry_lookup <- select(reg_tbl, node_id, registry)

# Time-to-event summary per registry and event; combinations nobody has
# reached (n == 0) are dropped before the registry names are tidied
df_summary_registry <- df_tte |>
    left_join(registry_lookup, by = "node_id") |>
    tte_summarizer(t = time_to_event, x = event, registry) |>
    filter(n > 0) |>
    mutate(registry = tidy_registry_names(registry)) |>
    rename(Registry = registry)

# Searchable interactive table, all rows on one page
registry_tte_gtbl <- tte_table(
    tbl = df_summary_registry,
    title = "Time to Lifecycle Open Status by Registry (in days)",
    group_label = "Registry",
    Registry
)

registry_tte_gtbl |>
    opt_interactive(use_search = TRUE, use_pagination = FALSE) |>
    opt_horizontal_padding(0)

Time to Lifecycle Open Status (in days) by Registry

Among all Open Science Registrations (OSR)

Sequencing

There are several potential insights related to the sequencing of resource connections. The following table summarizes sequencing patterns for OSRs with at least two linked resources. The rows correspond to resource types (code, data, materials, supplements, and papers), while the columns correspond to sequencing order.

For instance, when looking within the “First” column, we see that Data is most frequently linked first (41%), followed by Papers (31%).

See the code

# Data: for each resource type and sequence position, the count and the share
# of registrations having that position, spread to one n_/p_ column pair per
# position (missing combinations filled with 0)
resource_events <- c("code", "data", "materials", "supplements", "papers")

df_summary <- df_tte |>
    filter(event %in% resource_events, !is.na(sequence)) |>
    tte_summarizer(t = sequence, x = event, sequence) |>
    mutate(event = stringr::str_to_title(event)) |>
    arrange(sequence, event) |>
    select(event, sequence, n, p) |>
    pivot_wider(
        id_cols = event,
        names_from = sequence,
        values_from = c(n, p),
        names_glue = "{.value}_{sequence}",
        values_fill = 0
    ) |>
    # Interleave count and percent for positions 1 through 5
    select(event, n_1, p_1, n_2, p_2, n_3, p_3, n_4, p_4, n_5, p_5)

# Spanner label: bold ordinal with the position's total count underneath
position_label <- function(ordinal, counts) {
    md(glue("**{ordinal}**<br>(n={comma(sum(counts))})"))
}

# Number of registrations with at least one sequenced resource, for the subtitle
n_sequenced <- format(n_distinct(df_sequence$node_id), big.mark = ",")

# Table
df_summary |>
    gt(rowname_col = "event") |>
    tab_header(
        title = "Sequence of Connected Resources",
        subtitle = paste0("Among all Open Science Registrations (OSR) [n = ", n_sequenced, "]")
    ) |>
    cols_label(
        starts_with("n_") ~ "Count",
        starts_with("p_") ~ "Percent"
    ) |>
    # Spanners, one per sequence position
    tab_spanner(columns = ends_with("1"), label = position_label("First", df_summary$n_1)) |>
    tab_spanner(columns = ends_with("2"), label = position_label("Second", df_summary$n_2)) |>
    tab_spanner(columns = ends_with("3"), label = position_label("Third", df_summary$n_3)) |>
    tab_spanner(columns = ends_with("4"), label = position_label("Fourth", df_summary$n_4)) |>
    tab_spanner(columns = ends_with("5"), label = position_label("Fifth", df_summary$n_5)) |>
    # Row groups: resource types split into outputs and outcomes
    tab_row_group(label = "Outputs", rows = OUTPUTS) |>
    tab_row_group(label = "Outcomes", rows = OUTCOMES) |>
    tab_stub_indent(rows = c(OUTPUTS, OUTCOMES), indent = 3) |>
    tab_options(row_group.background.color = "gray95") |>
    row_group_order(groups = c("Outputs", "Outcomes")) |>
    # Value formatting
    fmt_number(columns = starts_with("n_"), decimals = 0) |>
    fmt_percent(columns = starts_with("p_"), decimals = 1) |>
    opt_interactive()

Sequence of Connected Resources

Among all Open Science Registrations (OSR) [n = 1,988]

Resource Sequencing and Latency

A third potential line of inquiry explores how/whether sequence order relates to overall latency times for resource connections of different types. Due to the limited number of OSRs with all five or at least four linked resources of any type, the following table summarizes latency times by resource type and sequence order.

IN PROGRESS

See the code

# Recompute time to event based on distance to last event
# df_summary <- df_tte |>
#     filter(event %in% c("code", "data", "materials", "supplements", "papers")) |> 
#     filter(!is.na(sequence) & sequence < 4) |>
#     group_by(node_id, event) |>
#     arrange(sequence) |>
#     mutate(
#         benchmark_date = ifelse(sequence == 1, registered_date, date),
#         time_to_event = as.numeric(date - benchmark_date, units = "days")
#     ) |>
#     ungroup()
#     tte_summarizer(x = event, t = time_to_event, event, sequence) |>
#     distinct(event, sequence, mean, p50, sd, min, max, .keep_all = TRUE) |>
#     select(-p) |>
#     arrange(event, sequence) |>
#     group_by(event) |>
#     mutate(
#         median_diff = if_else(
#             sequence == 1, p50,
#             p50 - lag(p50)),
#         mean_diff = mean - lag(mean)
#     )
# df_summary

Time Series

We can also examine the efficiency of Lifecycle Open Science Registrations over time. By efficiency, we mean the ratio of Lifecycle Open Science Registrations to Open Science Registrations. In other words, the percentage of Open Science Registrations that meet criteria for Lifecycle Open Science.

See the code

# Long form of the efficiency ratio (one Metric / Percentage pair per date)
efficiency_long <- overall_tbl |>
    select(Date, `LOS-Reg / OSR`) |>
    pivot_longer(
        cols = c(`LOS-Reg / OSR`),
        names_to = "Metric",
        values_to = "Percentage"
    )

# Unsmoothed time series of the ratio
plot_time_series(
    efficiency_long,
    Date,
    Percentage,
    .smooth = FALSE,
    .title = "Efficiency of Lifecycle Open Science Registrations"
)

We can also plot these percentages over time for each registry. Because of the disparity in the number of registrations between registries, we calculate the percentage of Open Science Registrations that meet the Lifecycle Open Science criteria. This is the LOS-Reg / OSR metric from the previous tables.

See the code

# Long form of the per-registry efficiency ratio, with grouping removed
registry_ptbl <- registry_tbl |>
    select(Date, Registry, `LOS-Reg / OSR`) |>
    pivot_longer(
        cols = c(`LOS-Reg / OSR`),
        names_to = "Metric",
        values_to = "Percentage"
    ) |>
    ungroup()

# Unsmoothed time series of the ratio, one coloured line per registry
registry_efficiency <- filter(registry_ptbl, Metric == "LOS-Reg / OSR")

plot_time_series(
    registry_efficiency,
    Date,
    Percentage,
    .color_var = Registry,
    .smooth = FALSE,
    .title = "Efficiency of Lifecycle Open Science Registrations by Registry"
)

CHANGELOG

2025-10-22
- Revisions to time-to-event analyses
- Preprints-forward recipe pending
2025-10-17
- Initial time-to-event analyses
- Revisions to README.md
2025-10-03
- Disaggregate by OSF Registration template
- UI updates
- Consolidated repository structure (data processing + presentation layers)
2025-10-01
- Align operationalization of “outputs” and “outcomes” with organizational definitions
- Drop counts and computations involving “Total” (i.e., unfiltered) registrations
- Drop non-embargoed requirement from the “Openness” criteria for OSR status
2025-09-15: Initial version