diff options
| author | Bob Killen <bob.killen@linux.com> | 2020-03-30 10:50:35 -0400 |
|---|---|---|
| committer | Bob Killen <bob.killen@linux.com> | 2020-03-30 16:13:53 -0400 |
| commit | 1a24e7bb4da9755f2ed3b65a20c9197c5b61ddc4 (patch) | |
| tree | 843063b339a82d55a8fb007abe3199a5c14328e8 | |
| parent | 9bd7de0d8dd8f21606cbad6f77edc6d02e14a42e (diff) | |
Add survey analysis scripts
Co-authored-by: Bryan Wilkinson <bwilkinson@minerkasch.com>
Co-authored-by: Bob Killen <bob.killen@linux.com>
4 files changed, 1263 insertions, 0 deletions
# Reconstructed from a line-collapsed diff rendering. The commit adds three
# files; `__init__.py` is empty, the two modules below are reproduced with
# conventional formatting. All runtime strings are byte-identical to the
# source; only formatting, comments/docstrings, and mutable-default
# arguments (now None sentinels, behavior-preserving) were changed.

# --- sig-contributor-experience/surveys/k8s_survey_analysis/plot_utils.py ---

from textwrap import wrap
import math
import plotnine as p9
import pandas as pd
import textwrap
from textwrap import shorten
from matplotlib import pyplot as plt
from copy import copy


from mizani.palettes import brewer_pal
from plotnine.scales.scale import scale_discrete


# Custom scales for plotnine that reverse the direction of the colors.
# They bypass the parent __init__ so the palette is built with direction=-1.
class reverse_scale_color_brewer(p9.scale_color_brewer):
    def __init__(self, type="seq", palette=1, direction=-1, **kwargs):
        self.palette = brewer_pal(type, palette, direction)
        scale_discrete.__init__(self, **kwargs)


class reverse_scale_fill_brewer(p9.scale_fill_brewer):
    def __init__(self, type="seq", palette=1, direction=-1, **kwargs):
        self.palette = brewer_pal(type, palette, direction)
        scale_discrete.__init__(self, **kwargs)


def split_for_likert(topic_data_long, mid_point):
    """
    Returns the aggregated counts for ratings in the top and bottom halves of
    each category, necessary for making offset bar charts.

    Args:
        topic_data_long (pandas.DataFrame): A pandas DataFrame storing each
            respondent's ratings for a given topic, in long format
        mid_point (int): The midpoint used to split the ratings into two halves

    Returns:
        (tuple): Tuple containing:
            (pandas.DataFrame): Aggregated counts for ratings greater than or
                equal to the midpoint
            (pandas.DataFrame): Aggregated counts for ratings less than or
                equal to the midpoint
    """
    x = topic_data_long.columns.tolist()
    x.remove("level_1")

    top_cutoff = topic_data_long["rating"] >= mid_point
    bottom_cutoff = topic_data_long["rating"] <= mid_point

    # Reindex against the full cartesian product of category values so that
    # (category, rating) combinations with zero responses still appear.
    top_scores = (
        topic_data_long[top_cutoff]
        .groupby(x)
        .count()
        .reindex(
            pd.MultiIndex.from_product(
                [topic_data_long[y].unique().tolist() for y in x], names=x
            ),
            fill_value=0,
        )
        .reset_index()
        .sort_index(ascending=False)
    )

    # The mid point is in both the top and bottom halves, so divide by two
    top_scores.loc[top_scores["rating"] == mid_point, "level_1"] = (
        top_scores[top_scores["rating"] == mid_point]["level_1"] / 2.0
    )

    bottom_scores = (
        topic_data_long[bottom_cutoff]
        .groupby(x)
        .count()
        .reindex(
            pd.MultiIndex.from_product(
                [topic_data_long[y].unique().tolist() for y in x], names=x
            ),
            fill_value=0,
        )
        .reset_index()
    )

    # The mid point is in both the top and bottom halves, so divide by two
    bottom_scores.loc[bottom_scores["rating"] == mid_point, "level_1"] = (
        bottom_scores[bottom_scores["rating"] == mid_point]["level_1"] / 2.0
    )

    return top_scores, bottom_scores


def make_long(data, facets, multi_year=False):
    """Converts a wide dataframe with columns for each topic's rating into a
    long dataframe.

    Args:
        data (pandas.DataFrame): A wide dataframe
        facets (list): List of columns to keep as their own column
        multi_year (bool, optional): Defaults to False. If True, add the
            "year" column to the list of facets

    Returns:
        (pandas.DataFrame): Long dataframe
    """
    facets = copy(facets)
    if multi_year:
        facets.append("year")
    long_data = data.set_index(facets, append=True).stack().reset_index()

    # Rename so level_0 always has the values of the topic we are interested
    # in; which auto-generated level name exists depends on how many facets
    # were set as index, so all candidates map to "level_0".
    long_data = long_data.rename(
        columns={
            "level_0": "level_1",
            "level_4": "level_0",
            "level_3": "level_0",
            "level_2": "level_0",
            0: "rating",
        }
    )
    long_data = long_data.assign(
        level_0=pd.Categorical(long_data.level_0, ordered=True)
    )
    return long_data


def get_data_subset(
    survey_data, topic, facets=None, exclude_new_contributors=False, include_year=False
):
    """Get only the relevant columns from the data.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        topic (str): String that all questions of interest start with
        facets (list, optional): List of columns used for grouping
        exclude_new_contributors (bool, optional): Defaults to False. If True,
            remove all responses from contributors who have been involved a
            year or less.
        include_year (bool, optional): Defaults to False. If True, include the
            year column in the output

    Returns:
        (pandas.DataFrame): Survey dataframe with only columns relevant to the
            topic and facets remaining.
    """
    # None sentinel instead of a mutable [] default; same semantics.
    facets = [] if facets is None else facets

    og_cols = [x for x in survey_data.columns if x.startswith(topic)]
    facets = copy(facets)
    if include_year:
        facets.append("year")
    if facets:
        # "." is a facet_grid placeholder, not a real column — strip it
        # before selecting columns, then restore it.
        if "." in facets:
            facets.remove(".")
            cols = og_cols + facets
            facets.append(".")
        else:
            cols = og_cols + facets
    else:
        cols = og_cols

    if exclude_new_contributors:
        topic_data = survey_data[
            survey_data["Contributing_Length"] != "less than one year"
        ][cols]
    else:
        topic_data = survey_data[cols]

    return topic_data


def get_multi_year_data_subset(
    survey_data, topic, facet_by=None, exclude_new_contributors=False
):
    """Get appropriate data for multi-year plots and convert it to long form.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        topic (str): String that all questions of interest start with
        facet_by (list, optional): List of columns used for grouping
        exclude_new_contributors (bool, optional): Defaults to False. If True,
            remove all responses from contributors who have been involved a
            year or less.

    Returns:
        (pandas.DataFrame): Long dataframe
    """
    facet_by = [] if facet_by is None else facet_by

    topic_data = get_data_subset(
        survey_data, topic, facet_by, exclude_new_contributors, include_year=True
    )

    if facet_by:
        # Temporarily drop the "." facet_grid placeholder around make_long;
        # the caller's list is restored to its original contents.
        if "." in facet_by:
            facet_by.remove(".")
            topic_data_long = make_long(topic_data, facet_by, multi_year=True)
            facet_by.append(".")
        else:
            topic_data_long = make_long(topic_data, facet_by, multi_year=True)
    else:
        topic_data_long = make_long(topic_data, [], multi_year=True)

    return topic_data_long


def get_single_year_data_subset(survey_data, topic, facet_by=None):
    """Get appropriate data for single-year plots and convert it to long form.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        topic (str): String that all questions of interest start with
        facet_by (list, optional): List of columns used for grouping

    Returns:
        (pandas.DataFrame): Long dataframe
    """
    facet_by = [] if facet_by is None else facet_by

    topic_data = get_data_subset(survey_data, topic, facet_by)

    if facet_by:
        if "." in facet_by:
            facet_by.remove(".")
            topic_data_long = make_long(topic_data, facet_by)
            facet_by.append(".")
        else:
            topic_data_long = make_long(topic_data, facet_by)
    else:

        topic_data_long = (
            topic_data.unstack().reset_index().rename(columns={0: "rating"})
        )
        topic_data_long = topic_data_long.assign(
            level_0=pd.Categorical(topic_data_long.level_0, ordered=True)
        )

    return topic_data_long


def make_bar_chart_multi_year(
    survey_data, topic, facet_by=None, exclude_new_contributors=False
):
    """Make a barchart showing proportions of respondents listing each
    column that starts with topic. Bars are colored by which year of
    the survey they correspond to. If facet_by is not empty, the resulting
    plot will be faceted into subplots by the variables given.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        topic (str): String that all questions of interest start with
        facet_by (list, optional): List of columns used for grouping
        exclude_new_contributors (bool, optional): Defaults to False. If True,
            do not include any responses from contributors with less than
            one year of experience

    Returns:
        (plotnine.ggplot): Plot object which can be displayed in a notebook or
            saved out to a file
    """
    facet_by = [] if facet_by is None else facet_by

    topic_data = get_data_subset(
        survey_data, topic, facet_by, exclude_new_contributors, include_year=True
    )

    if facet_by:
        fix = False
        if "." in facet_by:
            facet_by.remove(".")
            fix = True
        agg = (
            topic_data.groupby(facet_by + ["year"])
            .sum()
            .reset_index()
            .melt(id_vars=facet_by + ["year"])
        )
        totals = (
            topic_data.groupby(facet_by + ["year"])
            .count()
            .reset_index()
            .melt(id_vars=facet_by + ["year"])
        )
        percent = agg.merge(totals, on=facet_by + ["year", "variable"])

        if fix:
            facet_by.append(".")

    else:
        agg = topic_data.groupby(["year"]).sum().reset_index().melt(id_vars=["year"])
        totals = (
            topic_data.groupby(["year"]).count().reset_index().melt(id_vars=["year"])
        )
        percent = agg.merge(totals, on=["year", "variable"])

    # This plot is always done proportionally
    percent = percent.assign(value=percent["value_x"] / percent["value_y"])
    percent = percent.assign(variable=pd.Categorical(percent.variable, ordered=True))

    br = (
        p9.ggplot(percent, p9.aes(x="variable", fill="factor(year)", y="value"))
        + p9.geom_bar(show_legend=True, position="dodge", stat="identity")
        + p9.theme(
            axis_text_x=p9.element_text(angle=45, ha="right"),
            strip_text_y=p9.element_text(angle=0, ha="left"),
        )
        + p9.scale_x_discrete(
            limits=sorted(percent["variable"].unique().tolist()),
            labels=[
                shorten(
                    x.replace(topic, "").replace("_", " "), placeholder="...", width=30
                )
                for x in sorted(percent["variable"].unique().tolist())
            ],
        )
    )

    # Uncomment to return dataframe instead of plot
    # return percent

    if facet_by:
        br = (
            br
            + p9.facet_grid(
                facet_by,
                shrink=False,
                labeller=lambda x: "\n".join(wrap(x.replace("/", "/ "), 15)),
            )
            + p9.theme(
                strip_text_x=p9.element_text(wrap=True, va="bottom", margin={"b": -0.5})
            )
        )
    return br


def make_single_bar_chart_multi_year(survey_data, column, facet, proportionally=False):
    """Make a barchart showing the number of respondents responding to a
    single column. Bars are colored by which year of the survey they
    correspond to. If facet is not empty, the resulting plot will be faceted
    into subplots by the variables given.

    NOTE(review): `facet` is used inconsistently below — `[column, facet]`
    and `p9.aes(x=facet, ...)` treat it as a single column name, while
    `groupby(facet + ["year"])` and `make_long(topic_data, facet, ...)`
    require a list. Confirm the intended type against the calling notebooks;
    behavior is preserved as written.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        column (str): Column to plot responses to
        facet (list, optional): List of columns used for grouping
        proportionally (bool, optional): Defaults to False. If True,
            the bar heights are determined proportionally to the
            total number of responses in that facet.

    Returns:
        (plotnine.ggplot): Plot object which can be displayed in a notebook or
            saved out to a file
    """
    cols = [column, facet]
    show_legend = False
    topic_data = survey_data[cols + ["year"]]

    topic_data_long = make_long(topic_data, facet, multi_year=True)

    if proportionally:
        proportions = (
            topic_data_long[topic_data_long.rating == 1].groupby(facet + ["year"]).sum()
            / topic_data_long.groupby(facet + ["year"]).sum()
        ).reset_index()
    else:
        proportions = (
            topic_data_long[topic_data_long.rating == 1]
            .groupby(facet + ["year"])
            .count()
            .reset_index()
        )

    x = topic_data_long.columns.tolist()
    x.remove("level_1")

    ## Uncomment to return dataframe instead of plot
    # return proportions

    return (
        p9.ggplot(proportions, p9.aes(x=facet, fill="year", y="level_1"))
        + p9.geom_bar(show_legend=show_legend, stat="identity")
        + p9.theme(
            axis_text_x=p9.element_text(angle=45, ha="right"),
            strip_text_y=p9.element_text(angle=0, ha="left"),
        )
        + p9.scale_x_discrete(
            limits=topic_data_long[facet].unique().tolist(),
            labels=[
                x.replace("_", " ") for x in topic_data_long[facet].unique().tolist()
            ],
        )
    )


def make_likert_chart_multi_year(
    survey_data,
    topic,
    labels,
    facet_by=None,
    five_is_high=False,
    exclude_new_contributors=False,
):
    """Make an offset stacked barchart showing the number of respondents at
    each rank or value for all columns in the topic. Each column in the topic
    is a facet, with the years displayed along the x-axis.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        topic (str): String that all questions of interest start with
        labels (list): List of strings to use as labels, corresponding
            to the numerical values given by the respondents.
        facet_by (list, optional): List of columns used for grouping
        five_is_high (bool, optional): Defaults to False. If True,
            five is considered the highest value in a ranking, otherwise
            it is taken as the lowest value.
        exclude_new_contributors (bool, optional): Defaults to False. If True,
            do not include any responses from contributors with less than
            one year of experience

    Returns:
        (plotnine.ggplot): Offset stacked barchart plot object which
            can be displayed in a notebook or saved out to a file
    """
    facet_by = [] if facet_by is None else facet_by

    facet_by = copy(facet_by)
    og_cols = [x for x in survey_data.columns if x.startswith(topic)]
    show_legend = True

    topic_data_long = get_multi_year_data_subset(
        survey_data, topic, facet_by, exclude_new_contributors
    )

    # When five is the LOW end, negate ratings so the "good" half still ends
    # up above the axis after splitting at the midpoint.
    if not five_is_high:
        topic_data_long = topic_data_long.assign(rating=topic_data_long.rating * -1.0)

    mid_point = 3 if five_is_high else -3
    top_scores, bottom_scores = split_for_likert(topic_data_long, mid_point)

    if facet_by:
        fix = False
        if "." in facet_by:
            facet_by.remove(".")
            fix = True

        # Calculate proportion for each rank
        top_scores = top_scores.merge(
            topic_data_long.groupby(facet_by + ["year"]).count().reset_index(),
            on=facet_by + ["year"],
        ).rename(columns={"rating_x": "rating", "level_0_x": "level_0"})
        top_scores = top_scores.assign(
            level_1=top_scores.level_1_x / (top_scores.level_1_y / len(og_cols))
        )

        bottom_scores = bottom_scores.merge(
            topic_data_long.groupby(facet_by + ["year"]).count().reset_index(),
            on=facet_by + ["year"],
        ).rename(columns={"rating_x": "rating", "level_0_x": "level_0"})
        bottom_scores = bottom_scores.assign(
            level_1=bottom_scores.level_1_x
            * -1
            / (bottom_scores.level_1_y / len(og_cols))
        )

        if fix:
            facet_by.append(".")
    else:
        # Calculate proportion for each rank
        top_scores = top_scores.merge(
            topic_data_long.groupby(["year"]).count().reset_index(), on=["year"]
        ).rename(columns={"rating_x": "rating", "level_0_x": "level_0"})
        top_scores = top_scores.assign(
            level_1=top_scores.level_1_x / (top_scores.level_1_y / len(og_cols))
        )

        bottom_scores = bottom_scores.merge(
            topic_data_long.groupby(["year"]).count().reset_index(), on=["year"]
        ).rename(columns={"rating_x": "rating", "level_0_x": "level_0"})
        bottom_scores = bottom_scores.assign(
            level_1=bottom_scores.level_1_x
            * -1
            / (bottom_scores.level_1_y / len(og_cols))
        )

    vp = (
        p9.ggplot(
            topic_data_long,
            p9.aes(x="factor(year)", fill="factor(rating)", color="factor(rating)"),
        )
        + p9.geom_col(
            data=top_scores,
            mapping=p9.aes(y="level_1"),
            show_legend=show_legend,
            size=0.25,
            position=p9.position_stack(reverse=True),
        )
        + p9.geom_col(
            data=bottom_scores,
            mapping=p9.aes(y="level_1"),
            show_legend=show_legend,
            size=0.25,
            position=p9.position_stack(),
        )
        + p9.geom_hline(yintercept=0, color="white")
    )

    if five_is_high:
        vp = (
            vp
            + p9.scale_color_brewer(
                "div", "RdBu", limits=[1, 2, 3, 4, 5], labels=labels
            )
            + p9.scale_fill_brewer("div", "RdBu", limits=[1, 2, 3, 4, 5], labels=labels)
            + p9.theme(
                axis_text_x=p9.element_text(angle=45, ha="right"),
                strip_text_y=p9.element_text(angle=0, ha="left"),
            )
        )
    else:
        vp = (
            vp
            + p9.scale_color_brewer(
                "div", "RdBu", limits=[-5, -4, -3, -2, -1], labels=labels
            )
            + p9.scale_fill_brewer(
                "div", "RdBu", limits=[-5, -4, -3, -2, -1], labels=labels
            )
            + p9.theme(strip_text_y=p9.element_text(angle=0, ha="left"))
        )

    # NOTE(review): the remove() below assumes "." is present whenever
    # facet_by is non-empty (the convention used by the calling notebooks);
    # a facet list without "." would raise ValueError — confirm upstream.
    if facet_by:
        facet_by.remove(".")

    else:
        facet_by.append(".")

    vp = (
        vp
        + p9.facet_grid(
            facet_by + ["level_0"],
            labeller=lambda x: "\n".join(
                wrap(
                    x.replace(topic, "").replace("_", " ").replace("/", "/ ").strip(),
                    15,
                )
            ),
        )
        + p9.theme(
            strip_text_x=p9.element_text(wrap=True, ma="left"), panel_spacing_x=0.1
        )
    )

    return vp


def make_bar_chart(survey_data, topic, facet_by=None, proportional=False):
    """Make a barchart showing the number of respondents listing each
    column that starts with topic for a single year. If facet_by is
    not empty, the resulting plot will be faceted into subplots
    by the variables given.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        topic (str): String that all questions of interest start with
        facet_by (list, optional): List of columns used for grouping
        proportional (bool, optional): Defaults to False. If True,
            the bar heights are determined proportionally to the
            total number of responses in that facet.

    Returns:
        (plotnine.ggplot): Plot object which can be displayed in a notebook or
            saved out to a file
    """
    facet_by = [] if facet_by is None else facet_by

    show_legend = False
    if facet_by:
        show_legend = True

    topic_data_long = get_single_year_data_subset(survey_data, topic, facet_by)

    x = topic_data_long.columns.tolist()
    x.remove("level_1")

    if facet_by:
        period = False
        if "." in facet_by:
            facet_by.remove(".")
            period = True

        aggregate_data = (
            topic_data_long[topic_data_long.rating == 1]
            .dropna()
            .groupby(["level_0"] + facet_by)
            .count()
            .reset_index()
        )

        if period:
            facet_by.append(".")

    else:
        aggregate_data = (
            topic_data_long[topic_data_long.rating == 1]
            .dropna()
            .groupby("level_0")
            .count()
            .reset_index()
        )

    if proportional and facet_by:
        period = False
        if "." in facet_by:
            facet_by.remove(".")
            period = True

        facet_sums = (
            topic_data_long[topic_data_long.rating == 1]
            .dropna()
            .groupby(facet_by)
            .count()
            .reset_index()
        )

        aggregate_data = aggregate_data.merge(facet_sums, on=facet_by).rename(
            columns={"level_0_x": "level_0"}
        )
        aggregate_data = aggregate_data.assign(
            rating=aggregate_data.rating_x / aggregate_data.rating_y
        )

        if period:
            facet_by.append(".")

    br = (
        p9.ggplot(aggregate_data, p9.aes(x="level_0", fill="level_0", y="rating"))
        + p9.geom_bar(show_legend=show_legend, stat="identity")
        + p9.theme(
            axis_text_x=p9.element_text(angle=45, ha="right"),
            strip_text_y=p9.element_text(angle=0, ha="left"),
        )
        + p9.scale_x_discrete(
            limits=topic_data_long["level_0"].unique().tolist(),
            labels=[
                "\n".join(
                    textwrap.wrap(x.replace(topic, "").replace("_", " "), width=35)[0:2]
                )
                for x in topic_data_long["level_0"].unique().tolist()
            ],
        )
    )

    if facet_by:
        br = (
            br
            + p9.facet_grid(
                facet_by, shrink=False, labeller=lambda x: "\n".join(wrap(x, 15))
            )
            + p9.theme(
                axis_text_x=p9.element_blank(),
                strip_text_x=p9.element_text(
                    wrap=True, va="bottom", margin={"b": -0.5}
                ),
            )
            + p9.scale_fill_discrete(
                limits=topic_data_long["level_0"].unique().tolist(),
                labels=[
                    "\n".join(
                        wrap(
                            x.replace(topic, "")
                            .replace("_", " ")
                            .replace("/", "/ ")
                            .strip(),
                            30,
                        )
                    )
                    for x in topic_data_long["level_0"].unique().tolist()
                ],
            )
        )
    return br


def make_likert_chart(
    survey_data,
    topic,
    labels,
    facet_by=None,
    max_value=5,
    max_is_high=False,
    wrap_facets=True,
    sort_x=False,
):
    """Make an offset stacked barchart showing the number of respondents at
    each rank or value for all columns in the topic. Each column in the
    original data is a tick on the x-axis.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        topic (str): String that all questions of interest start with
        labels (list): List of strings to use as labels, corresponding
            to the numerical values given by the respondents.
        facet_by (list, optional): List of columns used for grouping
        max_value (int, optional): Defaults to 5. The maximum value a
            respondent can assign.
        max_is_high (bool, optional): Defaults to False. If True,
            the max_value is considered the highest value in a ranking,
            otherwise it is taken as the lowest value.
        wrap_facets (bool, optional): Defaults to True. If True, the facet
            labels are wrapped
        sort_x (bool, optional): Defaults to False. If True, the x-axis is
            sorted by the mean value for each column in the original data

    Returns:
        (plotnine.ggplot): Offset stacked barchart plot object which
            can be displayed in a notebook or saved out to a file
    """
    facet_by = [] if facet_by is None else facet_by

    mid_point = math.ceil(max_value / 2)

    og_cols = [x for x in survey_data.columns if x.startswith(topic)]
    show_legend = True

    topic_data_long = get_single_year_data_subset(survey_data, topic, facet_by)

    if not max_is_high:
        topic_data_long = topic_data_long.assign(rating=topic_data_long.rating * -1.0)

        mid_point = -1 * mid_point

    top_scores, bottom_scores = split_for_likert(topic_data_long, mid_point)

    if facet_by:
        fix = False
        if "." in facet_by:
            facet_by.remove(".")
            fix = True

        top_scores = top_scores.merge(
            topic_data_long.groupby(facet_by).count().reset_index(), on=facet_by
        ).rename(columns={"rating_x": "rating", "level_0_x": "level_0"})
        top_scores = top_scores.assign(
            level_1=top_scores.level_1_x / (top_scores.level_1_y / len(og_cols))
        )

        bottom_scores = bottom_scores.merge(
            topic_data_long.groupby(facet_by).count().reset_index(), on=facet_by
        ).rename(columns={"rating_x": "rating", "level_0_x": "level_0"})
        bottom_scores = bottom_scores.assign(
            level_1=bottom_scores.level_1_x
            * -1
            / (bottom_scores.level_1_y / len(og_cols))
        )

        if fix:
            facet_by.append(".")

    else:
        bottom_scores = bottom_scores.assign(level_1=bottom_scores.level_1 * -1)

    if sort_x:
        x_sort_order = (
            topic_data_long.groupby("level_0")
            .mean()
            .sort_values("rating")
            .reset_index()["level_0"]
            .values.tolist()
        )
        x_sort_order.reverse()
    else:
        x_sort_order = topic_data_long["level_0"].unique().tolist()

    vp = (
        p9.ggplot(
            topic_data_long,
            p9.aes(x="level_0", fill="factor(rating)", color="factor(rating)"),
        )
        + p9.geom_col(
            data=top_scores,
            mapping=p9.aes(y="level_1"),
            show_legend=show_legend,
            size=0.25,
            position=p9.position_stack(reverse=True),
        )
        + p9.geom_col(
            data=bottom_scores,
            mapping=p9.aes(y="level_1"),
            show_legend=show_legend,
            size=0.25,
            position=p9.position_stack(),
        )
        + p9.geom_hline(yintercept=0, color="white")
        + p9.theme(
            axis_text_x=p9.element_text(angle=45, ha="right"),
            strip_text_y=p9.element_text(angle=0, ha="left"),
        )
        + p9.scale_x_discrete(
            limits=x_sort_order,
            labels=[
                "\n".join(
                    textwrap.wrap(x.replace(topic, "").replace("_", " "), width=35)[0:2]
                )
                for x in x_sort_order
            ],
        )
    )

    if max_is_high:
        vp = (
            vp
            + p9.scale_color_brewer(
                "div", "RdBu", limits=list(range(1, max_value + 1)), labels=labels
            )
            + p9.scale_fill_brewer(
                "div", "RdBu", limits=list(range(1, max_value + 1)), labels=labels
            )
        )

    else:
        vp = (
            vp
            + reverse_scale_fill_brewer(
                "div",
                "RdBu",
                limits=list(reversed(range(-max_value, 0))),
                labels=labels,
            )
            + reverse_scale_color_brewer(
                "div",
                "RdBu",
                limits=list(reversed(range(-max_value, 0))),
                labels=labels,
            )
        )

    if facet_by:
        if wrap_facets:
            vp = (
                vp
                + p9.facet_grid(facet_by, labeller=lambda x: "\n".join(wrap(x, 15)))
                + p9.theme(
                    strip_text_x=p9.element_text(
                        wrap=True, va="bottom", margin={"b": -0.5}
                    )
                )
            )
        else:
            vp = vp + p9.facet_grid(facet_by, space="free", labeller=lambda x: x)
    return vp


def make_single_likert_chart(survey_data, column, facet, labels, five_is_high=False):
    """Make an offset stacked barchart showing the number of respondents at
    each rank or value for a single column in the original data. Each facet
    is shown as a tick on the x-axis.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        column (str): Column to plot responses to
        labels (list): List of strings to use as labels, corresponding
            to the numerical values given by the respondents.
        facet (str): Column used for grouping
        five_is_high (bool, optional): Defaults to False. If True,
            5 is considered the highest value in a ranking, otherwise
            it is taken as the lowest value.

    Returns:
        (plotnine.ggplot): Offset stacked barchart plot object which
            can be displayed in a notebook or saved out to a file
    """
    mid_point = 3
    cols = [column, facet]
    show_legend = True
    topic_data = survey_data[cols]

    topic_data_long = make_long(topic_data, facet)

    if not five_is_high:
        topic_data_long = topic_data_long.assign(rating=topic_data_long.rating * -1.0)
    x = topic_data_long.columns.tolist()
    x.remove("level_1")
    x.remove("level_0")

    if not five_is_high:
        mid_point *= -1

    top_cutoff = topic_data_long["rating"] >= mid_point
    bottom_cutoff = topic_data_long["rating"] <= mid_point

    top_scores = (
        topic_data_long[top_cutoff]
        .groupby(x)
        .count()
        .reset_index()
        .sort_index(ascending=False)
    )

    # The midpoint belongs to both halves, so split its count in two.
    top_scores.loc[top_scores["rating"] == mid_point, "level_1"] = (
        top_scores[top_scores["rating"] == mid_point]["level_1"] / 2.0
    )
    top_scores = top_scores.merge(
        topic_data_long.groupby(facet).count().reset_index(), on=facet
    )
    top_scores = top_scores.assign(level_1=top_scores.level_1_x / top_scores.level_1_y)

    bottom_scores = topic_data_long[bottom_cutoff].groupby(x).count().reset_index()
    bottom_scores.loc[bottom_scores["rating"] == mid_point, "level_1"] = (
        bottom_scores[bottom_scores["rating"] == mid_point]["level_1"] / 2.0
    )
    bottom_scores = bottom_scores.merge(
        topic_data_long.groupby(facet).count().reset_index(), on=facet
    )
    bottom_scores = bottom_scores.assign(
        level_1=bottom_scores.level_1_x * -1 / bottom_scores.level_1_y
    )

    vp = (
        p9.ggplot(
            topic_data_long,
            p9.aes(x=facet, fill="factor(rating_x)", color="factor(rating_x)"),
        )
        + p9.geom_col(
            data=top_scores,
            mapping=p9.aes(y="level_1"),
            show_legend=show_legend,
            size=0.25,
            position=p9.position_stack(reverse=True),
        )
        + p9.geom_col(
            data=bottom_scores,
            mapping=p9.aes(y="level_1"),
            show_legend=show_legend,
            size=0.25,
        )
        + p9.geom_hline(yintercept=0, color="white")
        + p9.theme(
            axis_text_x=p9.element_text(angle=45, ha="right"),
            strip_text_y=p9.element_text(angle=0, ha="left"),
        )
        + p9.scale_x_discrete(
            limits=topic_data_long[facet].unique().tolist(),
            labels=[
                x.replace("_", " ") for x in topic_data_long[facet].unique().tolist()
            ],
        )
    )

    if five_is_high:
        vp = (
            vp
            + p9.scale_color_brewer(
                "div",
                "RdBu",
                limits=[1, 2, 3, 4, 5],
                labels=["\n".join(wrap(x, 15)) for x in labels],
            )
            + p9.scale_fill_brewer(
                "div",
                "RdBu",
                limits=[1, 2, 3, 4, 5],
                labels=["\n".join(wrap(x, 15)) for x in labels],
            )
        )
    else:
        vp = (
            vp
            + reverse_scale_fill_brewer(
                "div",
                "RdBu",
                limits=[-1, -2, -3, -4, -5],
                labels=["\n".join(wrap(x, 15)) for x in labels],
            )
            + reverse_scale_color_brewer(
                "div",
                "RdBu",
                limits=[-1, -2, -3, -4, -5],
                labels=["\n".join(wrap(x, 15)) for x in labels],
            )
        )

    return vp


def make_single_bar_chart(survey_data, column, facet, proportionally=False, facet2=None):
    """Make a barchart showing the number of respondents marking
    a certain column in the original dataset as True. The facet
    variable values are used as ticks on the x-axis.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        column (str): Column to plot responses to
        facet (str): Column used for grouping
        proportionally (bool, optional): Defaults to False. If True,
            the bar heights are determined proportionally to the
            total number of responses in that facet.
        facet2 (str, optional): If provided, a second variable to facet
            against.

    Returns:
        (plotnine.ggplot): Plot object which can be displayed in a notebook or
            saved out to a file
    """
    cols = [column, facet]
    if facet2:
        cols.append(facet2)
    show_legend = False
    topic_data = survey_data[cols]

    grouper = [facet, facet2] if facet2 else facet
    topic_data_long = make_long(topic_data, grouper)

    if proportionally:
        proportions = (
            topic_data_long[topic_data_long.rating == 1].groupby(grouper).sum()
            / topic_data_long.groupby(grouper).sum()
        ).reset_index()
    else:
        proportions = (
            topic_data_long[topic_data_long.rating == 1]
            .groupby(grouper)
            .count()
            .reset_index()
        )

    x = topic_data_long.columns.tolist()
    x.remove("level_1")

    br = (
        p9.ggplot(proportions, p9.aes(x=facet, fill=facet, y="level_1"))
        + p9.geom_bar(show_legend=show_legend, stat="identity")
        + p9.theme(
            axis_text_x=p9.element_text(angle=45, ha="right"),
            strip_text_y=p9.element_text(angle=0, ha="left"),
        )
        + p9.scale_x_discrete(
            limits=topic_data_long[facet].unique().tolist(),
            labels=[
                x.replace("_", " ") for x in topic_data_long[facet].unique().tolist()
            ],
        )
    )

    if facet2:
        br = br + p9.facet_grid([facet2, "."])

    return br


# --- sig-contributor-experience/surveys/k8s_survey_analysis/prepare_2018.py ---

import pandas as pd
import numpy as np


# Maps 2018 survey column names to their 2019 equivalents so the two years
# can be analyzed together. Keys/values are verbatim survey headers.
convert_2018_to_2019 = {
    'Blocker:_Code/Doc_review': 'Blocker:_Code/Documentation_review',
    'Blocker:_GH_tools&processes_(not_our_customized_tooling)': 'Blocker:_GitHub_tools_and_processes_(not_our_customized_tooling)',
    'Blocker:_Finding_a/the_right_SIG': 'Blocker:_Finding_the_right_SIG_for_your_contributions',
    'Blocker:_Finding_issues_to_work_on': 'Blocker:_Finding_appropriate_issues_to_work_on',
    'Blocker:_Setting_up_dev_env': 'Blocker:_Setting_up_development_environment',
    'Use_freq:_Zoom_Mtgs': 'Use_freq:_Zoom_video_conferencing/meetings',
    'Use_freq:_GH_(comments,_issues,_prs)': 'Use_freq:_Discussions_on_Github_Issues_and_PRs',
    'Use_freq:_Unofficial(Twitter,_Reddit,_etc.)': 'Use_freq:_Unofficial_channels_(IRC,_WeChat,_etc.)',
    'Use_freq:_YT_Recordings': 'Use_freq:_YouTube_recordings_(community_meetings,_SIG/WG_meetings,_etc.)',
    'Use_freq:_GDocs/Forms/Sheets,_etc_(meeting_agendas,_etc)': 'Use_freq:_Google_Docs/Forms/Sheets,_etc_(meeting_agendas,_etc)',
    'Contribute:_code_to_k/k': 'Contribute:_Core_code_inside_of_kubernetes/kubernetes',
    'Contribute:_code_in_a_k/*_GH_org': 'Contribute:_Code_inside_of_another_repo_in_the_Kubernetes_GitHub_Org_(example:_/kubernetes-sigs,_kubernetes/website,_etc)',
    'Contribute:_Docs': 'Contribute:_Documentation',
    'Contribute:_Testing_and_CI': 'Contribute:_Testing_&_Infrastructure',
    'Contribute:_Related_projects_(Kubeadm,_Helm,_container_runtimes,_etc.)': 'Contribute:_Related_projects_(Helm,_container_runtimes,_other_CNCF_projects,_etc.)',
    'Contribute:_Not_yet': 'Contribute:_Don’t_contribute_yet,_hoping_to_start_soon',
    'Contribute:_Other': 'Contribute:_Other_(please_specify)',
    'Level_of_Contributor_Laddor': 'Level_of_Contributor',
    'Most_Important_Proj:_Mentoring_programs': 'Most_Important_Proj:_Mentoring_programs_for_all_contributor_levels/roles\xa0(https://git.k8s.io/community/community-membership.md)',
    'Most_Important_Proj:_GH_Mgmt': 'Most_Important_Proj:_GitHub_Management',
    'Most_Important_Proj:_Contributor_Summits': 'Most_Important_Proj:_Delivering_valuable_contributor_summits_at_relevant_events',
    'Most_Important_Proj:_Keeping_community_safe': 'Most_Important_Proj:_Keeping_our_community_safe_on_our_various_communication_platforms_through_moderation_guidelines_and_new_approaches',
    'Check_for_news:_k-dev_ML': 'Check_for_news:_kubernetes-dev@_mailing_list',
    'Check_for_news:_discuss.kubernetes.io': 'Check_for_news:_Dedicated_discuss.k8s.io_forum_for_contributors',
    'Check_for_news:_contribex_ML': 'Check_for_news:_kubernetes-sig-contribex@\xa0mailing_list',
    'Check_for_news:_Slack': 'Check_for_news:_#kubernetes-dev,_#sig-foo,_#sig-contribex_slack',
    'Check_for_news:_Twitter_read_first_': 'Check_for_news:_@kubernetesio_twitter',
    'Check_for_news:_Kubernetes_blog_read_first_': 'Check_for_news:_Kubernetes_blog',
    'Check_for_news:_k/community_repo_in_GH_(Issues_and/or_PRs)_read_first': 'Check_for_news:_kubernetes/community_repo_in_GitHub_(Issues_and/or_PRs)',
    'Check_for_news:_Other': 'Check_for_news:_Other_(please_specify)',
    'Attended:_#_of_ContribSummits': 'How_many_Kubernetes_Contributor_Summits_have_you_attended',
    'HelpWanted_&/or_GoodFirstIssue_label_usage': 'Do_you_use_the\xa0Help_Wanted_and/or_Good_First_Issue_labels_on_issues_you_file_to_find_contributors',
    'Watched_or_participated_in_MoC': 'Have_you_watched_or_participated_in_an_episode_of_our_YouTube_mentoring_series_Meet_Our_Contributors_If_you_have_specific_suggestions,_leave_them_at_the_end_of_the_survey.',
    'Make_project_easier_to_contribute': 'Are_there_specific_ways_the_project_could_make_contributing_easier_for_you'
}

# 2018 answer wordings -> 2019 answer wordings, per question.
contrib_length_2018_to_2019 = {
    '1-2 years': 'one to two years',
    '2-3 years': 'two to three years',
    '3+ years': 'three+ years',
    '6 months-1 year': 'less than one year',
    'Just started': 'less than one year'
}

ladder_level_2018_to_2019 = {
    "Approver": "approver",
    "Had no idea this was even a thing": "there's a contributor ladder?",
    "Org Member": "member",
    "Reviewer": "reviewer",
    "I’m not an org member yet, but working on it": "not yet a member but working on it",
    "Subproject Owner": "subproject owner"
}

employer_2018_to_2019 = {
    "It’s complicated": "it's complicated.",
    "It’s entirely on my own time": "no, I need to use my own time",
    "Yes, it’s part of my job": "yes, I can contribute on company time",
    'No, but I’m able to use “free” time at work': "yes, I can contribute on company time"
}

oss_projects_2018_to_2019 = {
    'None, Kubernetes is my first one!': 'this is my first open source project!',
    'One more': '1 other',
    '2-4': '2 or more',
    '4+': '2 or more'
}

help_wanted_2018_to_2019 = {
    "No, because I didn't know they were there": "No",
    "No, because I don't think my issues qualify": "No",
    'Not as much as I should because I forget': "Rarely (for reasons)"
}

next_level_interest_2018_2019 = {
    'Yes, but would like mentorship.': 'if I had help/mentoring/support',
    'Yes, but not sure I have time.': 'if I had more free time',
    'Yes, doing it on my own.': 'yes',
    "No, I'm already an owner": 'no, already a subproject owner (highest level on the ladder)',
    'Not really': 'no'
}


def get_df(path):
    """Read the 2018 survey CSV and normalize it toward the 2019 schema.

    Args:
        path (str): Path to the 2018 survey CSV export.

    Returns:
        (pandas.DataFrame): Cleaned survey data with 2019-style column names.
    """
    survey_data = pd.read_csv(path)

    # Clean data: fill/convert the checkbox-style columns.
    for x in survey_data.columns:
        if x.startswith("Useful:"):
            survey_data = survey_data.assign(**{x: survey_data[x].fillna(0)})
        if x.startswith("Contribute:") or x.startswith("Check for news:") or x.startswith("Attended:") or x.startswith("Attending:") or x.startswith("Most Important Pr"):
            survey_data = survey_data.assign(**{x: np.where(survey_data[x].isna(), 0, 1)})
        if x.startswith('Upstream'):
            survey_data = survey_data.assign(**{x: survey_data[x].fillna("Didn't Answer")})

    survey_data = survey_data.rename(
        columns={
            x: x.replace(" ", "_")
            .replace("?", "")
            .replace('Most_Important_Project', 'Most_Important_Proj')
            .replace('Most_Important_Prj', 'Most_Important_Proj')
            for x in survey_data.columns
        }
    )

    survey_data = survey_data.drop('Use_freq:_discuss.kubernetes.io', axis=1)

    x = pd.to_datetime(survey_data.End_Date)
    survey_data = survey_data.assign(date_taken=x.dt.date)
    survey_data = survey_data.assign(
        Contributing_Length=survey_data['Contributing_Length'].apply(
            contrib_length_2018_to_2019.get
        )
    )

    survey_data = survey_data.rename(columns=convert_2018_to_2019)

    # NOTE(review): the source view is truncated here mid-statement
    # ("survey_data = ..."); the remaining 2018-specific transformations are
    # not visible in this chunk. Returning the frame as processed so far —
    # TODO: restore the truncated tail from the full file before relying on
    # this function.
    return survey_data
survey_data.assign(Level_of_Contributor = survey_data['Level_of_Contributor'].apply(lambda x: ladder_level_2018_to_2019.get(x,x))) + survey_data = survey_data.assign(Upstream_supported_at_employer = survey_data['Upstream_supported_at_employer'].apply(lambda x: employer_2018_to_2019.get(x,x))) + survey_data = survey_data.assign(Interested_in_next_level = survey_data['Interested_in_next_level'].apply(lambda x: next_level_interest_2018_2019.get(x,x) )) + survey_data = survey_data.assign(Contribute_to_other_OSS = survey_data['Contribute_to_other_OSS'].apply(lambda x: oss_projects_2018_to_2019.get(x,x))) + survey_data.loc[:,'Do_you_use_the\xa0Help_Wanted_and/or_Good_First_Issue_labels_on_issues_you_file_to_find_contributors'] = survey_data['Do_you_use_the\xa0Help_Wanted_and/or_Good_First_Issue_labels_on_issues_you_file_to_find_contributors'].apply(lambda x: help_wanted_2018_to_2019.get(x,x)) + + return survey_data diff --git a/sig-contributor-experience/surveys/k8s_survey_analysis/prepare_2019.py b/sig-contributor-experience/surveys/k8s_survey_analysis/prepare_2019.py new file mode 100644 index 00000000..ca8e6787 --- /dev/null +++ b/sig-contributor-experience/surveys/k8s_survey_analysis/prepare_2019.py @@ -0,0 +1,117 @@ +import pandas as pd +import numpy as np + +fn = '2019_survey/2019 Kubernetes Contributor Experience Survey PUBLIC.csv' + +contribute_header = "What areas of Kubernetes do you contribute to? Please check all that apply." +blockers_header = "Please rate any challenges to the listed steps of the contribution process" +agree_header = "Do you agree with the following statements (1 - strongly disagree, 5 - strongly agree):" +attend_header = "Which of the below would make you likely to attend more of the Community Meetings? Check all that apply." 
most_important_proj_header = "Some of the major projects SIG Contributor Experience is working on are listed below, rank the ones that are most important to you (and/or your SIG)"
use_freq_header = "Of our various communications channels, please rate which ones you use and/or check most frequently on a 1-5 scale, where 1 is “never”, 3 is “several times a month” and 5 is “every day”."
news_header = "Which of these channels is most likely to reach you first for news about decisions, changes, additions, and/or announcements to the contributor process or community matters?"


def map_blocker_and_usefreq_vals(val):
    """Convert a 1-5 rating cell to an int.

    The export stores ratings either as a bare number (e.g. "3") or as a
    labelled option whose first character is the numeric value; fall back to
    that first character when the whole cell is not parseable as an int.

    Raises:
        ValueError: if neither the cell nor its first character is an int.
    """
    try:
        return int(val)
    except ValueError:
        return int(val[0])


def process_header(df):
    """Flatten the two-row SurveyMonkey header into single column names.

    The CSV is read with header=[0, 1]: level 0 holds the question text
    ("Unnamed..." for continuation columns of matrix questions) and level 1
    holds the sub-option text ("Response" for simple questions). Matrix and
    checkbox columns are renamed to "<prefix> <sub-option>"; all other
    columns keep their question text. Mutates df.columns in place.

    Raises:
        NameError: if one of the expected section headers is absent from the
        export (its start index is never assigned before prefix_cols runs).
    """
    columns = list(df.columns)
    new_columns = [None] * len(columns)
    for i, col in enumerate(columns):
        # Continuation and single-"Response" columns: keep the question text
        # for now; matrix columns get overwritten by prefix_cols below.
        if col[1].startswith("Unnamed") or col[1] == "Response":
            new_columns[i] = col[0]
            continue

        # Find the starting column for the multilabel responses (checkboxes)
        # that were also in the 2018 survey
        if col[0] == blockers_header:
            blockers_i = i
        elif col[0] == contribute_header:
            contribute_i = i
        elif col[0] == news_header:
            news_i = i
        elif col[0] == use_freq_header:
            use_freq_i = i
        elif col[0] == most_important_proj_header:
            most_important_proj_i = i
        elif col[0] == agree_header:  # Starting columns for multilabel responses that weren't in the 2018 survey.
            agree_i = i
        elif col[0] == attend_header:
            attend_i = i
        #elif col[0] == unattendance_header:
        #    unattendance_i = i
        else:  # Handle open ended responses
            new_columns[i] = col[0]

    def prefix_cols(header, header_i, prefix):
        # Rename every column of the matrix question starting at header_i:
        # the run ends at the first column whose level-0 label is neither the
        # header nor an "Unnamed" continuation.
        i = header_i
        while i < len(columns) and (columns[i][0].startswith("Unnamed") or columns[i][0] == header):
            new_columns[i] = "{} {}".format(prefix, columns[i][1])
            i += 1

    prefix_cols(contribute_header, contribute_i, "Contribute:")
    prefix_cols(blockers_header, blockers_i, "Blocker:")
    prefix_cols(news_header, news_i, "Check for news:")
    prefix_cols(use_freq_header, use_freq_i, "Use freq:")
    prefix_cols(most_important_proj_header, most_important_proj_i, "Most Important Project:")

    prefix_cols(agree_header, agree_i, "Agree:")
    prefix_cols(attend_header, attend_i, "Would attend if:")

    df.columns = new_columns


def get_df(file_name=None):
    """Load and clean the 2019 contributor survey export.

    Args:
        file_name (str, optional): CSV path; defaults to the standard
            2019 export location.

    Returns:
        pandas.DataFrame: Cleaned responses with flattened, underscored
        column names, numeric rating columns, 0/1 checkbox columns, and a
        date_taken column derived from End_Date.
    """
    fn = '2019_survey/2019 Kubernetes Contributor Experience Survey PUBLIC.csv'
    if file_name:
        fn = file_name

    df = pd.read_csv(fn, header=[0, 1], skipinitialspace=True)
    process_header(df)

    # Shorten the long question texts used as column names. The original
    # literal repeated two keys ("What level of the Contributor Ladder..."
    # and "What region of the world are you in?"); duplicates are removed
    # here, keeping the last-wins value ("World Region") so behavior is
    # unchanged (spaces become underscores in the bulk rename below anyway).
    df = df.rename(columns={
        "How long have you been contributing to Kubernetes?": "Contributing_Length",
        "What level of the Contributor Ladder do you consider yourself to be on? (pick the highest if you are in multiple OWNERs files)": "Level_of_Contributor",
        "Are you interested in advancing to the next level of the Contributor Ladder?": "Interested_in_next_level",
        "How many other open source projects not in the Kubernetes ecosystem do you contribute to? (example: nodejs, debian)": "Contribute_to_other_OSS",
        "Does your employer support your contributions to Kubernetes?": "Upstream_supported_at_employer",
        "Blocker: Other (please specify)": "Other blockers (please specify)",
        "What region of the world are you in?": "World Region",
    })

    # Clean data (the original re-defined map_blocker_and_usefreq_vals here,
    # shadowing the identical module-level function; the module-level one is
    # used directly instead).
    for x in df.columns:
        if x.startswith("Useful:"):
            # Unanswered usefulness ratings count as 0.
            df = df.assign(**{x: df[x].fillna(0)})
        if x.startswith(("Contribute:", "Check for news:", "Attended:", "Attending:", "Would attend if:")):
            # Checkbox columns: any answer -> 1, blank -> 0.
            df = df.assign(**{x: np.where(df[x].isna(), 0, 1)})
        if x.startswith('Upstream'):
            df = df.assign(**{x: df[x].fillna("Didn't Answer")})
        if x.startswith("Blocker:") and x != "Blocker: Other (please specify)":
            # na_action="ignore" leaves missing ratings as NaN.
            df[x] = df[x].map(map_blocker_and_usefreq_vals, na_action="ignore")
        if x.startswith(("Use freq:", "Agree:")):
            df[x] = df[x].map(map_blocker_and_usefreq_vals, na_action="ignore")

    # Normalize column names: spaces -> underscores, drop "?", and unify the
    # "Most Important Project"/"...Prj" prefix spellings.
    df = df.rename(
        columns={
            x: x.replace(" ", "_")
            .replace("?", "")
            .replace('Most_Important_Project', 'Most_Important_Proj')
            .replace('Most_Important_Prj', 'Most_Important_Proj')
            for x in df.columns
        }
    )

    x = pd.to_datetime(df.End_Date)
    df = df.assign(date_taken=x.dt.date)

    return df


# TODO NOTE I should only be dropping these at plot time
# df.dropna(subset=["Level_of_Contributor",
#                   "Interested_in_next_level",
#                   "Upstream_supported_at_employer"], inplace=True)
