summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBob Killen <bob.killen@linux.com>2020-03-30 10:50:35 -0400
committerBob Killen <bob.killen@linux.com>2020-03-30 16:13:53 -0400
commit1a24e7bb4da9755f2ed3b65a20c9197c5b61ddc4 (patch)
tree843063b339a82d55a8fb007abe3199a5c14328e8
parent9bd7de0d8dd8f21606cbad6f77edc6d02e14a42e (diff)
Add survey analysis scripts
Co-authored-by: Bryan Wilkinson <bwilkinson@minerkasch.com> Co-authored-by: Bob Killen <bob.killen@linux.com>
-rw-r--r--sig-contributor-experience/surveys/k8s_survey_analysis/__init__.py0
-rw-r--r--sig-contributor-experience/surveys/k8s_survey_analysis/plot_utils.py1028
-rw-r--r--sig-contributor-experience/surveys/k8s_survey_analysis/prepare_2018.py118
-rw-r--r--sig-contributor-experience/surveys/k8s_survey_analysis/prepare_2019.py117
4 files changed, 1263 insertions, 0 deletions
diff --git a/sig-contributor-experience/surveys/k8s_survey_analysis/__init__.py b/sig-contributor-experience/surveys/k8s_survey_analysis/__init__.py
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/sig-contributor-experience/surveys/k8s_survey_analysis/__init__.py
diff --git a/sig-contributor-experience/surveys/k8s_survey_analysis/plot_utils.py b/sig-contributor-experience/surveys/k8s_survey_analysis/plot_utils.py
new file mode 100644
index 00000000..ef2c436b
--- /dev/null
+++ b/sig-contributor-experience/surveys/k8s_survey_analysis/plot_utils.py
@@ -0,0 +1,1028 @@
+from textwrap import wrap
+import math
+import plotnine as p9
+import pandas as pd
+import textwrap
+from textwrap import shorten
+from matplotlib import pyplot as plt
+from copy import copy
+
+
+from mizani.palettes import brewer_pal
+from plotnine.scales.scale import scale_discrete
+
+# Custom scales for plotnine that reverse the direction of the colors
class reverse_scale_color_brewer(p9.scale_color_brewer):
    """Discrete color scale using a ColorBrewer palette with the palette
    direction reversed by default (``direction=-1``).
    """

    def __init__(self, type="seq", palette=1, direction=-1, **kwargs):
        # Build the reversed palette first, then initialize only the generic
        # discrete-scale machinery — skipping scale_color_brewer.__init__,
        # which would rebuild self.palette with its own direction handling.
        # NOTE(review): depends on plotnine internals (scale_discrete.__init__);
        # verify against the pinned plotnine version.
        self.palette = brewer_pal(type, palette, direction)
        scale_discrete.__init__(self, **kwargs)
+
+
class reverse_scale_fill_brewer(p9.scale_fill_brewer):
    """Discrete fill scale using a ColorBrewer palette with the palette
    direction reversed by default (``direction=-1``).
    """

    def __init__(self, type="seq", palette=1, direction=-1, **kwargs):
        # Same trick as reverse_scale_color_brewer: set the reversed palette,
        # then bypass the brewer parent's __init__ so it is not overwritten.
        # NOTE(review): depends on plotnine internals (scale_discrete.__init__);
        # verify against the pinned plotnine version.
        self.palette = brewer_pal(type, palette, direction)
        scale_discrete.__init__(self, **kwargs)
+
+
def split_for_likert(topic_data_long, mid_point):
    """
    Returns the aggregated counts for ratings in the top and bottom halves
    of each category, necessary for making offset (diverging) bar charts.

    Args:
        topic_data_long (pandas.Dataframe): A pandas Dataframe storing each respondents
            ratings for a given topic, in long format
        mid_point (int): The midpoint rating used to split the data into two halves

    Returns:
        (tuple): Tuple containing:
            (pandas.DataFrame): Aggregated counts for ratings greater than or equal to the midpoint
            (pandas.DataFrame): Aggregated counts for ratings less than or equal to the midpoint
    """
    # Group by every column except the respondent index (level_1); after
    # .count(), the level_1 column holds the number of respondents per group.
    x = topic_data_long.columns.tolist()
    x.remove("level_1")

    # Note: rows at exactly mid_point satisfy both masks, so the midpoint
    # appears in both halves (its count is halved below).
    top_cutoff = topic_data_long["rating"] >= mid_point
    bottom_cutoff = topic_data_long["rating"] <= mid_point

    top_scores = (
        topic_data_long[top_cutoff]
        .groupby(x)
        .count()
        # Reindex against the full cartesian product of group values so that
        # combinations with zero responses still appear as 0 counts.
        .reindex(
            pd.MultiIndex.from_product(
                [topic_data_long[y].unique().tolist() for y in x], names=x
            ),
            fill_value=0,
        )
        .reset_index()
        .sort_index(ascending=False)
    )

    # The mid point is in both the top and bottom halves, so divide by two
    top_scores.loc[top_scores["rating"] == mid_point, "level_1"] = (
        top_scores[top_scores["rating"] == mid_point]["level_1"] / 2.0
    )

    bottom_scores = (
        topic_data_long[bottom_cutoff]
        .groupby(x)
        .count()
        .reindex(
            pd.MultiIndex.from_product(
                [topic_data_long[y].unique().tolist() for y in x], names=x
            ),
            fill_value=0,
        )
        .reset_index()
    )

    # The mid point is in both the top and bottom halves, so divide by two
    bottom_scores.loc[bottom_scores["rating"] == mid_point, "level_1"] = (
        bottom_scores[bottom_scores["rating"] == mid_point]["level_1"] / 2.0
    )

    return top_scores, bottom_scores
+
+
def make_long(data, facets, multi_year=False):
    """Convert a wide dataframe with one column per topic rating into long form.

    Args:
        data (pandas.DataFrame): Wide dataframe.
        facets (list): Columns to keep as identifier columns.
        multi_year (bool, optional): Defaults to False. When True, "year" is
            treated as an additional identifier column.

    Returns:
        (pandas.DataFrame): Long dataframe with columns ``level_1`` (the
        original row index), the facet columns, ``level_0`` (the stacked
        column name, as an ordered categorical) and ``rating``.
    """
    id_cols = copy(facets)
    if multi_year:
        id_cols.append("year")

    stacked = data.set_index(id_cols, append=True).stack().reset_index()

    # After reset_index the unnamed index levels come out as level_0 (the
    # original row index) and level_N (the stacked column names, where N
    # depends on how many identifier columns there are).  Normalize the names
    # so level_0 always holds the topic/question and level_1 the respondent.
    renames = {"level_0": "level_1", 0: "rating"}
    for stacked_level in ("level_2", "level_3", "level_4"):
        renames[stacked_level] = "level_0"
    stacked = stacked.rename(columns=renames)

    return stacked.assign(level_0=pd.Categorical(stacked.level_0, ordered=True))
+
+
def get_data_subset(
    survey_data, topic, facets=None, exclude_new_contributors=False, include_year=False
):
    """Get only the relevant columns from the data.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        topic (str): String that all questions of interest start with
        facets (list, optional): List of columns used for grouping.  A "."
            entry (plotnine facet placeholder) is ignored when selecting
            columns.  The caller's list is never mutated.
        exclude_new_contributors (bool, optional): Defaults to False. If True,
            remove all responses from contributors who have been involved a
            year or less.
        include_year (bool, optional): Defaults to False. If True, include the
            "year" column in the output.

    Returns:
        (pandas.DataFrame): Survey dataframe with only columns relevant to the
        topic and facets remaining.
    """
    # Copy to avoid the mutable-default-argument pitfall and so the caller's
    # list is never mutated.
    facets = [] if facets is None else copy(facets)
    if include_year:
        facets.append("year")

    # "." is a plotnine facet placeholder, not a real column name.
    if "." in facets:
        facets.remove(".")

    topic_cols = [col for col in survey_data.columns if col.startswith(topic)]
    cols = topic_cols + facets

    if exclude_new_contributors:
        # Drop respondents who have contributed for a year or less.
        return survey_data[
            survey_data["Contributing_Length"] != "less than one year"
        ][cols]
    return survey_data[cols]
+
+
def get_multi_year_data_subset(
    survey_data, topic, facet_by=[], exclude_new_contributors=False
):
    """Get appropriate data for multi-year plots and convert it to long form

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        topic (str): String that all questions of interest start with
        facet_by (list, optional): List of columns used for grouping.  May
            contain a "." plotnine facet placeholder; it is removed before
            reshaping and re-appended afterwards, so the caller's list is left
            in its original state.
        exclude_new_contributors (bool, optional): Defaults to False. If True, remove
            all responses from contributors who have been involved a year or less.

    Returns:
        (pandas.DataFrame): Long dataframe
    """
    topic_data = get_data_subset(
        survey_data, topic, facet_by, exclude_new_contributors, include_year=True
    )

    if facet_by:
        if "." in facet_by:
            # "." is a plotnine facet placeholder, not a real column; hide it
            # from make_long and then restore the caller's list.
            facet_by.remove(".")
            topic_data_long = make_long(topic_data, facet_by, multi_year=True)
            facet_by.append(".")
        else:
            topic_data_long = make_long(topic_data, facet_by, multi_year=True)

    else:
        topic_data_long = make_long(topic_data, [], multi_year=True)

    return topic_data_long
+
+
def get_single_year_data_subset(survey_data, topic, facet_by=[]):
    """Get appropriate data for single-year plots and convert it to long form

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        topic (str): String that all questions of interest start with
        facet_by (list, optional): List of columns used for grouping.  A "."
            entry (plotnine facet placeholder) is removed before reshaping and
            re-appended afterwards.

    Returns:
        (pandas.DataFrame): Long dataframe with ``level_0`` (question, as an
        ordered categorical), a respondent-index column and ``rating``, plus
        any facet columns.

    """
    topic_data = get_data_subset(survey_data, topic, facet_by)

    if facet_by:
        if "." in facet_by:
            # "." is a plotnine facet placeholder, not a column; hide it from
            # make_long and then restore the caller's list.
            facet_by.remove(".")
            topic_data_long = make_long(topic_data, facet_by)
            facet_by.append(".")
        else:
            topic_data_long = make_long(topic_data, facet_by)
    else:

        # No facets: a plain unstack produces the long shape directly
        # (level_0 = question column name, level_1 = respondent index).
        topic_data_long = (
            topic_data.unstack().reset_index().rename(columns={0: "rating"})
        )
        topic_data_long = topic_data_long.assign(
            level_0=pd.Categorical(topic_data_long.level_0, ordered=True)
        )

    return topic_data_long
+
+
def make_bar_chart_multi_year(
    survey_data, topic, facet_by=[], exclude_new_contributors=False
):
    """Make a barchart showing proportions of respondents listing each
    column that starts with topic. Bars are colored by which year of
    the survey they correspond to. If facet_by is not empty, the resulting
    plot will be faceted into subplots by the variables given.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        topic (str): String that all questions of interest start with
        facet_by (list, optional): List of columns used for grouping.  May
            contain a "." plotnine facet placeholder, which is removed for the
            aggregation and restored afterwards.
        exclude_new_contributors (bool, optional): Defaults to False. If True,
            do not include any responses from contributors with less than
            one year of experience

    Returns:
        (plotnine.ggplot): Plot object which can be displayed in a notebook or saved out to a file

    """
    topic_data = get_data_subset(
        survey_data, topic, facet_by, exclude_new_contributors, include_year=True
    )

    if facet_by:
        # "." is a plotnine facet placeholder, not a real column; drop it for
        # the groupby and restore it so the caller's list is left intact.
        fix = False
        if "." in facet_by:
            facet_by.remove(".")
            fix = True
        # Sum of the 1/0 answer columns = number of respondents selecting each
        # option, per facet group and year (numerator).
        agg = (
            topic_data.groupby(facet_by + ["year"])
            .sum()
            .reset_index()
            .melt(id_vars=facet_by + ["year"])
        )
        # Count = total responses per facet group and year (denominator).
        totals = (
            topic_data.groupby(facet_by + ["year"])
            .count()
            .reset_index()
            .melt(id_vars=facet_by + ["year"])
        )
        percent = agg.merge(totals, on=facet_by + ["year", "variable"])

        if fix:
            facet_by.append(".")

    else:
        agg = topic_data.groupby(["year"]).sum().reset_index().melt(id_vars=["year"])
        totals = (
            topic_data.groupby(["year"]).count().reset_index().melt(id_vars=["year"])
        )
        percent = agg.merge(totals, on=["year", "variable"])

    # This plot is always done proportionally: value_x is the selection count
    # (from sum), value_y the total response count (from count).
    percent = percent.assign(value=percent["value_x"] / percent["value_y"])
    percent = percent.assign(variable=pd.Categorical(percent.variable, ordered=True))

    br = (
        p9.ggplot(percent, p9.aes(x="variable", fill="factor(year)", y="value"))
        + p9.geom_bar(show_legend=True, position="dodge", stat="identity")
        + p9.theme(
            axis_text_x=p9.element_text(angle=45, ha="right"),
            strip_text_y=p9.element_text(angle=0, ha="left"),
        )
        + p9.scale_x_discrete(
            limits=sorted(percent["variable"].unique().tolist()),
            labels=[
                # Strip the topic prefix and shorten long question names so
                # the x tick labels stay readable.
                shorten(
                    x.replace(topic, "").replace("_", " "), placeholder="...", width=30
                )
                for x in sorted(percent["variable"].unique().tolist())
            ],
        )
    )

    # Uncomment to return dataframe instead of plot
    # return percent

    if facet_by:
        br = (
            br
            + p9.facet_grid(
                facet_by,
                shrink=False,
                labeller=lambda x: "\n".join(wrap(x.replace("/", "/ "), 15)),
            )
            + p9.theme(
                strip_text_x=p9.element_text(wrap=True, va="bottom", margin={"b": -0.5})
            )
        )
    return br
+
+
def make_single_bar_chart_multi_year(survey_data, column, facet, proportionally=False):
    """Make a barchart showing the number of respondents responding to a single column.
    Bars are colored by which year of the survey they correspond to. If facet
    is not empty, the resulting plot will be faceted into subplots by the variables
    given.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        column (str): Column to plot responses to
        facet (list): List of columns used for grouping.
            NOTE(review): ``cols = [column, facet]`` below treats ``facet`` as a
            single column name, while ``facet + ["year"]`` treats it as a list —
            confirm what the callers actually pass before relying on this.
        proportionally (bool, optional): Defaults to False. If True,
            the bar heights are determined proportionally to the
            total number of responses in that facet.

    Returns:
        (plotnine.ggplot): Plot object which can be displayed in a notebook or saved out to a file

    """
    cols = [column, facet]
    show_legend = False
    topic_data = survey_data[cols + ["year"]]

    topic_data_long = make_long(topic_data, facet, multi_year=True)

    if proportionally:
        # Share of each facet/year group that answered 1 (True).
        proportions = (
            topic_data_long[topic_data_long.rating == 1].groupby(facet + ["year"]).sum()
            / topic_data_long.groupby(facet + ["year"]).sum()
        ).reset_index()
    else:
        # Raw count of respondents answering 1 (True), per facet/year group.
        proportions = (
            topic_data_long[topic_data_long.rating == 1]
            .groupby(facet + ["year"])
            .count()
            .reset_index()
        )

    ## Uncomment to return dataframe instead of plot
    # return proportions

    return (
        p9.ggplot(proportions, p9.aes(x=facet, fill="year", y="level_1"))
        + p9.geom_bar(show_legend=show_legend, stat="identity")
        + p9.theme(
            axis_text_x=p9.element_text(angle=45, ha="right"),
            strip_text_y=p9.element_text(angle=0, ha="left"),
        )
        + p9.scale_x_discrete(
            limits=topic_data_long[facet].unique().tolist(),
            labels=[
                x.replace("_", " ") for x in topic_data_long[facet].unique().tolist()
            ],
        )
    )
+
+
def make_likert_chart_multi_year(
    survey_data,
    topic,
    labels,
    facet_by=[],
    five_is_high=False,
    exclude_new_contributors=False,
):
    """Make an offset stacked barchart showing the number of respondents at each rank or value for
    all columns in the topic. Each column in the topic is a facet, with the years displayed
    along the x-axis.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        topic (str): String that all questions of interest start with
        labels (list): List of strings to use as labels, corresponding
            to the numerical values given by the respondents.
        facet_by (list, optional): List of columns used for grouping
        five_is_high (bool, optional): Defaults to False. If True,
            five is considered the highest value in a ranking, otherwise
            it is taken as the lowest value.
        exclude_new_contributors (bool, optional): Defaults to False. If True,
            do not include any responses from contributors with less than
            one year of experience

    Returns:
        (plotnine.ggplot): Offset stacked barchart plot object which
        can be displayed in a notebook or saved out to a file
    """

    # Work on a copy so the "." placeholder juggling below never mutates the
    # caller's list (or the mutable default argument).
    facet_by = copy(facet_by)
    og_cols = [x for x in survey_data.columns if x.startswith(topic)]
    show_legend = True

    topic_data_long = get_multi_year_data_subset(
        survey_data, topic, facet_by, exclude_new_contributors
    )

    if not five_is_high:
        # Negate ratings so the "good" answers still stack on the positive
        # side of the axis when the lowest number is the best score.
        topic_data_long = topic_data_long.assign(rating=topic_data_long.rating * -1.0)

    mid_point = 3 if five_is_high else -3
    top_scores, bottom_scores = split_for_likert(topic_data_long, mid_point)

    if facet_by:
        fix = False
        if "." in facet_by:
            facet_by.remove(".")
            fix = True

        # Calculate proportion for each rank.  The merge suffixes the count
        # columns as level_1_x (per-rating count) and level_1_y (group total);
        # dividing by len(og_cols) accounts for each respondent appearing once
        # per question column in the long data.
        top_scores = top_scores.merge(
            topic_data_long.groupby(facet_by + ["year"]).count().reset_index(),
            on=facet_by + ["year"],
        ).rename(columns={"rating_x": "rating", "level_0_x": "level_0"})
        top_scores = top_scores.assign(
            level_1=top_scores.level_1_x / (top_scores.level_1_y / len(og_cols))
        )

        bottom_scores = bottom_scores.merge(
            topic_data_long.groupby(facet_by + ["year"]).count().reset_index(),
            on=facet_by + ["year"],
        ).rename(columns={"rating_x": "rating", "level_0_x": "level_0"})
        # Bottom half is drawn downwards, hence the * -1.
        bottom_scores = bottom_scores.assign(
            level_1=bottom_scores.level_1_x
            * -1
            / (bottom_scores.level_1_y / len(og_cols))
        )

        if fix:
            facet_by.append(".")
    else:
        # Calculate proportion for each rank
        top_scores = top_scores.merge(
            topic_data_long.groupby(["year"]).count().reset_index(), on=["year"]
        ).rename(columns={"rating_x": "rating", "level_0_x": "level_0"})
        top_scores = top_scores.assign(
            level_1=top_scores.level_1_x / (top_scores.level_1_y / len(og_cols))
        )

        bottom_scores = bottom_scores.merge(
            topic_data_long.groupby(["year"]).count().reset_index(), on=["year"]
        ).rename(columns={"rating_x": "rating", "level_0_x": "level_0"})
        bottom_scores = bottom_scores.assign(
            level_1=bottom_scores.level_1_x
            * -1
            / (bottom_scores.level_1_y / len(og_cols))
        )

    vp = (
        p9.ggplot(
            topic_data_long,
            p9.aes(x="factor(year)", fill="factor(rating)", color="factor(rating)"),
        )
        # Top (positive) half: stacks upwards from the midpoint.
        + p9.geom_col(
            data=top_scores,
            mapping=p9.aes(y="level_1"),
            show_legend=show_legend,
            size=0.25,
            position=p9.position_stack(reverse=True),
        )
        # Bottom (negative) half: stacks downwards from the midpoint.
        + p9.geom_col(
            data=bottom_scores,
            mapping=p9.aes(y="level_1"),
            show_legend=show_legend,
            size=0.25,
            position=p9.position_stack(),
        )
        + p9.geom_hline(yintercept=0, color="white")
    )

    if five_is_high:
        vp = (
            vp
            + p9.scale_color_brewer(
                "div", "RdBu", limits=[1, 2, 3, 4, 5], labels=labels
            )
            + p9.scale_fill_brewer("div", "RdBu", limits=[1, 2, 3, 4, 5], labels=labels)
            + p9.theme(
                axis_text_x=p9.element_text(angle=45, ha="right"),
                strip_text_y=p9.element_text(angle=0, ha="left"),
            )
        )
    else:
        # Ratings were negated above, so the limits run from -5 to -1.
        vp = (
            vp
            + p9.scale_color_brewer(
                "div", "RdBu", limits=[-5, -4, -3, -2, -1], labels=labels
            )
            + p9.scale_fill_brewer(
                "div", "RdBu", limits=[-5, -4, -3, -2, -1], labels=labels
            )
            + p9.theme(strip_text_y=p9.element_text(angle=0, ha="left"))
        )

    # Build the facet spec: one facet per question (level_0), plus any
    # user-supplied facets.  NOTE(review): when facet_by is non-empty this
    # removes "." without a membership check — a facet_by list that does not
    # contain "." would raise ValueError here; confirm the intended calling
    # convention (callers appear to always include ".").
    if facet_by:
        facet_by.remove(".")

    else:
        facet_by.append(".")

    vp = (
        vp
        + p9.facet_grid(
            facet_by + ["level_0"],
            labeller=lambda x: "\n".join(
                wrap(
                    x.replace(topic, "").replace("_", " ").replace("/", "/ ").strip(),
                    15,
                )
            ),
        )
        + p9.theme(
            strip_text_x=p9.element_text(wrap=True, ma="left"), panel_spacing_x=0.1
        )
    )

    return vp
+
+
def make_bar_chart(survey_data, topic, facet_by=[], proportional=False):
    """Make a barchart showing the number of respondents listing each
    column that starts with topic for a single year. If facet_by is
    not empty, the resulting plot will be faceted into subplots
    by the variables given.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        topic (str): String that all questions of interest start with
        facet_by (list, optional): List of columns used for grouping
        proportional (bool, optional): Defaults to False. If True,
            the bars heights are determined proportionally to the
            total number of responses in that facet.

    Returns:
        (plotnine.ggplot): Plot object which can be displayed in a notebook or saved out to a file
    """
    # Only show a legend when faceting (per-bar x labels are hidden there and
    # the legend carries the question names instead).
    show_legend = False
    if facet_by:
        show_legend = True

    topic_data_long = get_single_year_data_subset(survey_data, topic, facet_by)

    # NOTE(review): x is computed but never used below — looks like leftover
    # scaffolding.
    x = topic_data_long.columns.tolist()
    x.remove("level_1")

    if facet_by:
        # "." is a plotnine facet placeholder, not a column; drop it around
        # the groupby and restore it afterwards.
        period = False
        if "." in facet_by:
            facet_by.remove(".")
            period = True

        # Count respondents who selected each option (rating == 1), per facet.
        aggregate_data = (
            topic_data_long[topic_data_long.rating == 1]
            .dropna()
            .groupby(["level_0"] + facet_by)
            .count()
            .reset_index()
        )

        if period:
            facet_by.append(".")

    else:
        aggregate_data = (
            topic_data_long[topic_data_long.rating == 1]
            .dropna()
            .groupby("level_0")
            .count()
            .reset_index()
        )

    if proportional and facet_by:
        period = False
        if "." in facet_by:
            facet_by.remove(".")
            period = True

        # Per-facet totals, used as the denominator for the proportions.
        facet_sums = (
            topic_data_long[topic_data_long.rating == 1]
            .dropna()
            .groupby(facet_by)
            .count()
            .reset_index()
        )

        # merge suffixes duplicate columns as _x/_y; rating_x is the per-option
        # count, rating_y the facet total.
        aggregate_data = aggregate_data.merge(facet_sums, on=facet_by).rename(
            columns={"level_0_x": "level_0"}
        )
        aggregate_data = aggregate_data.assign(
            rating=aggregate_data.rating_x / aggregate_data.rating_y
        )

        if period:
            facet_by.append(".")

    br = (
        p9.ggplot(aggregate_data, p9.aes(x="level_0", fill="level_0", y="rating"))
        + p9.geom_bar(show_legend=show_legend, stat="identity")
        + p9.theme(
            axis_text_x=p9.element_text(angle=45, ha="right"),
            strip_text_y=p9.element_text(angle=0, ha="left"),
        )
        + p9.scale_x_discrete(
            limits=topic_data_long["level_0"].unique().tolist(),
            labels=[
                # Strip the topic prefix; keep at most two 35-char label lines.
                "\n".join(
                    textwrap.wrap(x.replace(topic, "").replace("_", " "), width=35)[0:2]
                )
                for x in topic_data_long["level_0"].unique().tolist()
            ],
        )
    )

    if facet_by:
        br = (
            br
            + p9.facet_grid(
                facet_by, shrink=False, labeller=lambda x: "\n".join(wrap(x, 15))
            )
            + p9.theme(
                # When faceting, the legend replaces the per-bar x labels.
                axis_text_x=p9.element_blank(),
                strip_text_x=p9.element_text(
                    wrap=True, va="bottom", margin={"b": -0.5}
                ),
            )
            + p9.scale_fill_discrete(
                limits=topic_data_long["level_0"].unique().tolist(),
                labels=[
                    "\n".join(
                        wrap(
                            x.replace(topic, "")
                            .replace("_", " ")
                            .replace("/", "/ ")
                            .strip(),
                            30,
                        )
                    )
                    for x in topic_data_long["level_0"].unique().tolist()
                ],
            )
        )
    return br
+
+
def make_likert_chart(
    survey_data,
    topic,
    labels,
    facet_by=[],
    max_value=5,
    max_is_high=False,
    wrap_facets=True,
    sort_x=False,
):
    """Make an offset stacked barchart showing the number of respondents at each rank or value for
    all columns in the topic. Each column in the original data is a tick on the x-axis

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        topic (str): String that all questions of interest start with
        labels (list): List of strings to use as labels, corresponding
            to the numerical values given by the respondents.
        facet_by (list, optional): List of columns used for grouping
        max_value (int, optional): Defaults to 5. The maximum value a respondent can assign.
        max_is_high (bool, optional): Defaults to False. If True,
            the max_value is considered the highest value in a ranking, otherwise
            it is taken as the lowest value.
        wrap_facets (bool, optional): Defaults to True. If True, the facet labels are
            wrapped
        sort_x (bool, optional): Defaults to False. If True, the x-axis is sorted by the
            mean value for each column in the original data

    Returns:
        (plotnine.ggplot): Offset stacked barchart plot object which
        can be displayed in a notebook or saved out to a file
    """

    mid_point = math.ceil(max_value / 2)

    og_cols = [x for x in survey_data.columns if x.startswith(topic)]
    show_legend = True

    topic_data_long = get_single_year_data_subset(survey_data, topic, facet_by)

    if not max_is_high:
        # Negate ratings so the "good" answers still stack on the positive
        # side of the axis when the lowest number is the best score.
        topic_data_long = topic_data_long.assign(rating=topic_data_long.rating * -1.0)

        mid_point = -1 * mid_point

    top_scores, bottom_scores = split_for_likert(topic_data_long, mid_point)

    if facet_by:
        # "." is a plotnine facet placeholder; drop it around the groupby and
        # restore it so the caller's list is left intact.
        fix = False
        if "." in facet_by:
            facet_by.remove(".")
            fix = True

        # Convert counts to proportions.  merge suffixes the count columns as
        # level_1_x (per-rating) and level_1_y (group total); dividing by
        # len(og_cols) accounts for each respondent appearing once per
        # question column in the long data.
        top_scores = top_scores.merge(
            topic_data_long.groupby(facet_by).count().reset_index(), on=facet_by
        ).rename(columns={"rating_x": "rating", "level_0_x": "level_0"})
        top_scores = top_scores.assign(
            level_1=top_scores.level_1_x / (top_scores.level_1_y / len(og_cols))
        )

        bottom_scores = bottom_scores.merge(
            topic_data_long.groupby(facet_by).count().reset_index(), on=facet_by
        ).rename(columns={"rating_x": "rating", "level_0_x": "level_0"})
        # Bottom half is drawn downwards, hence the * -1.
        bottom_scores = bottom_scores.assign(
            level_1=bottom_scores.level_1_x
            * -1
            / (bottom_scores.level_1_y / len(og_cols))
        )

        if fix:
            facet_by.append(".")

    else:
        # Raw counts: only the bottom half needs negating to point downwards.
        bottom_scores = bottom_scores.assign(level_1=bottom_scores.level_1 * -1)

    if sort_x:
        # Order questions by their mean rating.
        x_sort_order = (
            topic_data_long.groupby("level_0")
            .mean()
            .sort_values("rating")
            .reset_index()["level_0"]
            .values.tolist()
        )
        x_sort_order.reverse()
    else:
        x_sort_order = topic_data_long["level_0"].unique().tolist()

    vp = (
        p9.ggplot(
            topic_data_long,
            p9.aes(x="level_0", fill="factor(rating)", color="factor(rating)"),
        )
        # Top (positive) half: stacks upwards from the midpoint.
        + p9.geom_col(
            data=top_scores,
            mapping=p9.aes(y="level_1"),
            show_legend=show_legend,
            size=0.25,
            position=p9.position_stack(reverse=True),
        )
        # Bottom (negative) half: stacks downwards from the midpoint.
        + p9.geom_col(
            data=bottom_scores,
            mapping=p9.aes(y="level_1"),
            show_legend=show_legend,
            size=0.25,
            position=p9.position_stack(),
        )
        + p9.geom_hline(yintercept=0, color="white")
        + p9.theme(
            axis_text_x=p9.element_text(angle=45, ha="right"),
            strip_text_y=p9.element_text(angle=0, ha="left"),
        )
        + p9.scale_x_discrete(
            limits=x_sort_order,
            labels=[
                # Strip the topic prefix; keep at most two 35-char label lines.
                "\n".join(
                    textwrap.wrap(x.replace(topic, "").replace("_", " "), width=35)[0:2]
                )
                for x in x_sort_order
            ],
        )
    )

    if max_is_high:
        vp = (
            vp
            + p9.scale_color_brewer(
                "div", "RdBu", limits=list(range(1, max_value + 1)), labels=labels
            )
            + p9.scale_fill_brewer(
                "div", "RdBu", limits=list(range(1, max_value + 1)), labels=labels
            )
        )

    else:
        # Reversed brewer scales keep the color orientation consistent with
        # the negated ratings.
        vp = (
            vp
            + reverse_scale_fill_brewer(
                "div",
                "RdBu",
                limits=list(reversed(range(-max_value, 0))),
                labels=labels,
            )
            + reverse_scale_color_brewer(
                "div",
                "RdBu",
                limits=list(reversed(range(-max_value, 0))),
                labels=labels,
            )
        )

    if facet_by:
        if wrap_facets:
            vp = (
                vp
                + p9.facet_grid(facet_by, labeller=lambda x: "\n".join(wrap(x, 15)))
                + p9.theme(
                    strip_text_x=p9.element_text(
                        wrap=True, va="bottom", margin={"b": -0.5}
                    )
                )
            )
        else:
            vp = vp + p9.facet_grid(facet_by, space="free", labeller=lambda x: x)
    return vp
+
+
def make_single_likert_chart(survey_data, column, facet, labels, five_is_high=False):
    """Make an offset stacked barchart showing the number of respondents at each rank
    or value for a single column in the original data. Each facet value is shown as
    a tick on the x-axis

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        column (str): The single ratings column to plot
        facet (str): Column used for grouping (its values become the x-axis)
        labels (list): List of strings to use as labels, corresponding
            to the numerical values given by the respondents.
        five_is_high (bool, optional): Defaults to False. If True,
            5 is considered the highest value in a ranking, otherwise
            it is taken as the lowest value.

    Returns:
        (plotnine.ggplot): Offset stacked barchart plot object which
        can be displayed in a notebook or saved out to a file
    """
    mid_point = 3
    cols = [column, facet]
    show_legend = True
    topic_data = survey_data[cols]

    topic_data_long = make_long(topic_data, facet)

    if not five_is_high:
        # Negate ratings so the "good" answers still stack on the positive
        # side of the axis when 1 is the best score.
        topic_data_long = topic_data_long.assign(rating=topic_data_long.rating * -1.0)
    # Group keys: every column except the respondent index and question name.
    x = topic_data_long.columns.tolist()
    x.remove("level_1")
    x.remove("level_0")

    if not five_is_high:
        mid_point *= -1

    # Rows at exactly mid_point land in both halves; their count is halved
    # below so they straddle the zero line.
    top_cutoff = topic_data_long["rating"] >= mid_point
    bottom_cutoff = topic_data_long["rating"] <= mid_point

    top_scores = (
        topic_data_long[top_cutoff]
        .groupby(x)
        .count()
        .reset_index()
        .sort_index(ascending=False)
    )

    top_scores.loc[top_scores["rating"] == mid_point, "level_1"] = (
        top_scores[top_scores["rating"] == mid_point]["level_1"] / 2.0
    )
    # Merge in per-facet totals; duplicate columns get _x/_y suffixes
    # (level_1_x = per-rating count, level_1_y = facet total).
    top_scores = top_scores.merge(
        topic_data_long.groupby(facet).count().reset_index(), on=facet
    )
    top_scores = top_scores.assign(level_1=top_scores.level_1_x / top_scores.level_1_y)

    bottom_scores = topic_data_long[bottom_cutoff].groupby(x).count().reset_index()
    bottom_scores.loc[bottom_scores["rating"] == mid_point, "level_1"] = (
        bottom_scores[bottom_scores["rating"] == mid_point]["level_1"] / 2.0
    )
    bottom_scores = bottom_scores.merge(
        topic_data_long.groupby(facet).count().reset_index(), on=facet
    )
    # Bottom half is drawn downwards, hence the * -1.
    bottom_scores = bottom_scores.assign(
        level_1=bottom_scores.level_1_x * -1 / bottom_scores.level_1_y
    )

    vp = (
        p9.ggplot(
            topic_data_long,
            # rating_x exists on the merged top_scores/bottom_scores layer
            # data (merge suffix), not on topic_data_long itself; plotnine
            # evaluates the aes against each layer's own data.
            p9.aes(x=facet, fill="factor(rating_x)", color="factor(rating_x)"),
        )
        + p9.geom_col(
            data=top_scores,
            mapping=p9.aes(y="level_1"),
            show_legend=show_legend,
            size=0.25,
            position=p9.position_stack(reverse=True),
        )
        + p9.geom_col(
            data=bottom_scores,
            mapping=p9.aes(y="level_1"),
            show_legend=show_legend,
            size=0.25,
        )
        + p9.geom_hline(yintercept=0, color="white")
        + p9.theme(
            axis_text_x=p9.element_text(angle=45, ha="right"),
            strip_text_y=p9.element_text(angle=0, ha="left"),
        )
        + p9.scale_x_discrete(
            limits=topic_data_long[facet].unique().tolist(),
            labels=[
                x.replace("_", " ") for x in topic_data_long[facet].unique().tolist()
            ],
        )
    )

    if five_is_high:
        vp = (
            vp
            + p9.scale_color_brewer(
                "div",
                "RdBu",
                limits=[1, 2, 3, 4, 5],
                labels=["\n".join(wrap(x, 15)) for x in labels],
            )
            + p9.scale_fill_brewer(
                "div",
                "RdBu",
                limits=[1, 2, 3, 4, 5],
                labels=["\n".join(wrap(x, 15)) for x in labels],
            )
        )
    else:
        # Reversed brewer scales keep the color orientation consistent with
        # the negated ratings.
        vp = (
            vp
            + reverse_scale_fill_brewer(
                "div",
                "RdBu",
                limits=[-1, -2, -3, -4, -5],
                labels=["\n".join(wrap(x, 15)) for x in labels],
            )
            + reverse_scale_color_brewer(
                "div",
                "RdBu",
                limits=[-1, -2, -3, -4, -5],
                labels=["\n".join(wrap(x, 15)) for x in labels],
            )
        )

    return vp
+
+
def make_single_bar_chart(
    survey_data, column, facet, proportionally=False, facet2=None
):
    """Make a barchart showing the number of respondents marking
    a certain column in the original dataset as True. The facet
    variable values are used as ticks on the x-axis

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        column (str): Column to plot responses to
        facet (str): Column used for grouping (its values become the x-axis)
        proportionally (bool, optional): Defaults to False. If True,
            the bar heights are the proportion of responses within each facet
            value, rather than raw counts.
        facet2 (str, optional): If provided, a second variable to facet against.

    Returns:
        (plotnine.ggplot): Plot object which can be displayed in a notebook or saved out to a file
    """
    cols = [column, facet]
    if facet2:
        cols.append(facet2)
    show_legend = False
    topic_data = survey_data[cols]

    grouper = [facet, facet2] if facet2 else facet
    topic_data_long = make_long(topic_data, grouper)

    if proportionally:
        # Share of each facet group that answered 1 (True).
        proportions = (
            topic_data_long[topic_data_long.rating == 1].groupby(grouper).sum()
            / topic_data_long.groupby(grouper).sum()
        ).reset_index()
    else:
        # Raw count of respondents answering 1 (True) per facet group.
        proportions = (
            topic_data_long[topic_data_long.rating == 1]
            .groupby(grouper)
            .count()
            .reset_index()
        )

    br = (
        p9.ggplot(proportions, p9.aes(x=facet, fill=facet, y="level_1"))
        + p9.geom_bar(show_legend=show_legend, stat="identity")
        + p9.theme(
            axis_text_x=p9.element_text(angle=45, ha="right"),
            strip_text_y=p9.element_text(angle=0, ha="left"),
        )
        + p9.scale_x_discrete(
            limits=topic_data_long[facet].unique().tolist(),
            labels=[
                x.replace("_", " ") for x in topic_data_long[facet].unique().tolist()
            ],
        )
    )

    if facet2:
        br = br + p9.facet_grid([facet2, "."])

    return br
diff --git a/sig-contributor-experience/surveys/k8s_survey_analysis/prepare_2018.py b/sig-contributor-experience/surveys/k8s_survey_analysis/prepare_2018.py
new file mode 100644
index 00000000..1a9e3aba
--- /dev/null
+++ b/sig-contributor-experience/surveys/k8s_survey_analysis/prepare_2018.py
@@ -0,0 +1,118 @@
+import pandas as pd
+import numpy as np
+
+
# 2018 -> 2019 column-name translation, applied in get_df() after the 2018
# headers have been underscore-munged. Values are copied verbatim from the
# processed 2019 headers -- several contain literal "\xa0" escapes and curly
# quotes on purpose; do not normalize them.
convert_2018_to_2019 = {
    'Blocker:_Code/Doc_review':'Blocker:_Code/Documentation_review',
    'Blocker:_GH_tools&processes_(not_our_customized_tooling)': 'Blocker:_GitHub_tools_and_processes_(not_our_customized_tooling)',
    'Blocker:_Finding_a/the_right_SIG': 'Blocker:_Finding_the_right_SIG_for_your_contributions',
    'Blocker:_Finding_issues_to_work_on': 'Blocker:_Finding_appropriate_issues_to_work_on',
    'Blocker:_Setting_up_dev_env': 'Blocker:_Setting_up_development_environment',
    'Use_freq:_Zoom_Mtgs': 'Use_freq:_Zoom_video_conferencing/meetings',
    'Use_freq:_GH_(comments,_issues,_prs)': 'Use_freq:_Discussions_on_Github_Issues_and_PRs',
    'Use_freq:_Unofficial(Twitter,_Reddit,_etc.)':'Use_freq:_Unofficial_channels_(IRC,_WeChat,_etc.)',
    'Use_freq:_YT_Recordings': 'Use_freq:_YouTube_recordings_(community_meetings,_SIG/WG_meetings,_etc.)',
    'Use_freq:_GDocs/Forms/Sheets,_etc_(meeting_agendas,_etc)': 'Use_freq:_Google_Docs/Forms/Sheets,_etc_(meeting_agendas,_etc)',
    'Contribute:_code_to_k/k': 'Contribute:_Core_code_inside_of_kubernetes/kubernetes',
    'Contribute:_code_in_a_k/*_GH_org': 'Contribute:_Code_inside_of_another_repo_in_the_Kubernetes_GitHub_Org_(example:_/kubernetes-sigs,_kubernetes/website,_etc)',
    'Contribute:_Docs':'Contribute:_Documentation',
    'Contribute:_Testing_and_CI':'Contribute:_Testing_&_Infrastructure',
    'Contribute:_Related_projects_(Kubeadm,_Helm,_container_runtimes,_etc.)': 'Contribute:_Related_projects_(Helm,_container_runtimes,_other_CNCF_projects,_etc.)',
    'Contribute:_Not_yet': 'Contribute:_Don’t_contribute_yet,_hoping_to_start_soon',
    'Contribute:_Other': 'Contribute:_Other_(please_specify)',
    'Level_of_Contributor_Laddor':'Level_of_Contributor',
    'Most_Important_Proj:_Mentoring_programs':'Most_Important_Proj:_Mentoring_programs_for_all_contributor_levels/roles\xa0(https://git.k8s.io/community/community-membership.md)',
    'Most_Important_Proj:_GH_Mgmt':'Most_Important_Proj:_GitHub_Management',
    'Most_Important_Proj:_Contributor_Summits':'Most_Important_Proj:_Delivering_valuable_contributor_summits_at_relevant_events',
    'Most_Important_Proj:_Keeping_community_safe': 'Most_Important_Proj:_Keeping_our_community_safe_on_our_various_communication_platforms_through_moderation_guidelines_and_new_approaches',
    'Check_for_news:_k-dev_ML':'Check_for_news:_kubernetes-dev@_mailing_list',
    'Check_for_news:_discuss.kubernetes.io':'Check_for_news:_Dedicated_discuss.k8s.io_forum_for_contributors',
    'Check_for_news:_contribex_ML':'Check_for_news:_kubernetes-sig-contribex@\xa0mailing_list',
    'Check_for_news:_Slack':'Check_for_news:_#kubernetes-dev,_#sig-foo,_#sig-contribex_slack',
    'Check_for_news:_Twitter_read_first_':'Check_for_news:_@kubernetesio_twitter',
    'Check_for_news:_Kubernetes_blog_read_first_':'Check_for_news:_Kubernetes_blog',
    'Check_for_news:_k/community_repo_in_GH_(Issues_and/or_PRs)_read_first':'Check_for_news:_kubernetes/community_repo_in_GitHub_(Issues_and/or_PRs)',
    'Check_for_news:_Other':'Check_for_news:_Other_(please_specify)',
    'Attended:_#_of_ContribSummits':'How_many_Kubernetes_Contributor_Summits_have_you_attended',
    'HelpWanted_&/or_GoodFirstIssue_label_usage':'Do_you_use_the\xa0Help_Wanted_and/or_Good_First_Issue_labels_on_issues_you_file_to_find_contributors',
    'Watched_or_participated_in_MoC':'Have_you_watched_or_participated_in_an_episode_of_our_YouTube_mentoring_series_Meet_Our_Contributors_If_you_have_specific_suggestions,_leave_them_at_the_end_of_the_survey.',
    'Make_project_easier_to_contribute':'Are_there_specific_ways_the_project_could_make_contributing_easier_for_you'
}

# The remaining dicts translate individual ANSWER labels (not column names)
# from their 2018 wording to the 2019 wording.

# "How long have you been contributing?" buckets. 2018's two shortest
# buckets both collapse into 2019's single "less than one year" bucket.
contrib_length_2018_to_2019 = {
    '1-2 years': 'one to two years',
    '2-3 years': 'two to three years',
    '3+ years': 'three+ years',
    '6 months-1 year':'less than one year',
    'Just started': 'less than one year'
}

# Contributor-ladder level labels.
ladder_level_2018_to_2019 = {
    "Approver": "approver",
    "Had no idea this was even a thing": "there's a contributor ladder?",
    "Org Member": "member",
    "Reviewer": "reviewer",
    "I’m not an org member yet, but working on it": "not yet a member but working on it",
    "Subproject Owner": "subproject owner"
}

# Employer-support answers. 2018's "free time at work" option is folded
# into 2019's "company time" answer.
employer_2018_to_2019 = {
    "It’s complicated": "it's complicated.",
    "It’s entirely on my own time": "no, I need to use my own time",
    "Yes, it’s part of my job": "yes, I can contribute on company time",
    'No, but I’m able to use “free” time at work': "yes, I can contribute on company time"
}

# Count of other OSS projects contributed to; 2018's "2-4" and "4+"
# merge into 2019's "2 or more".
oss_projects_2018_to_2019 = {
    'None, Kubernetes is my first one!': 'this is my first open source project!',
    'One more':'1 other',
    '2-4' : '2 or more',
    '4+': '2 or more'
}

# Help Wanted / Good First Issue label usage. Answers missing here
# (e.g. plain "Yes") pass through unchanged via .get(x, x) in get_df().
help_wanted_2018_to_2019 = {
    "No, because I didn't know they were there": "No",
    "No, because I don't think my issues qualify": "No",
    'Not as much as I should because I forget' : "Rarely (for reasons)"
}

# Interest in advancing to the next contributor-ladder level.
next_level_interest_2018_2019 = {
    'Yes, but would like mentorship.': 'if I had help/mentoring/support',
    'Yes, but not sure I have time.': 'if I had more free time',
    'Yes, doing it on my own.': 'yes',
    "No, I'm already an owner": 'no, already a subproject owner (highest level on the ladder)',
    'Not really': 'no'
}
+
def get_df(path):
    """Load the 2018 contributor survey CSV and reshape it to match 2019.

    Args:
        path (str): Path to the raw 2018 survey CSV export.

    Returns:
        pandas.DataFrame: Survey responses with 2018 column names and
        answer labels translated to their 2019 equivalents (via the
        module-level mapping dicts) so both years share analysis code.
    """

    survey_data = pd.read_csv(path)

    #Clean Data
    for x in survey_data.columns:
        # Numeric "usefulness" ratings: treat a blank answer as a 0 rating.
        if x.startswith("Useful:"):
            survey_data = survey_data.assign(**{x: survey_data[x].fillna(0)})
        # Checkbox questions export the choice text when ticked and NaN
        # otherwise; convert to 1/0 indicators. "Most Important Pr" catches
        # both the "...Proj" and "...Prj" spellings seen in the export.
        if x.startswith("Contribute:") or x.startswith("Check for news:") or x.startswith("Attended:") or x.startswith("Attending:") or x.startswith("Most Important Pr"):
            survey_data = survey_data.assign(**{x: np.where(survey_data[x].isna(),0,1)})
        # Employer-support question: keep non-answers as an explicit label.
        if x.startswith('Upstream'):
            survey_data = survey_data.assign(**{x: survey_data[x].fillna("Didn't Answer")})

    # Normalize column names: spaces -> underscores, drop "?", and collapse
    # the inconsistent "Most Important ..." spellings into one prefix.
    survey_data = survey_data.rename(columns= {x:x.replace(" ","_").replace("?", "").replace('Most_Important_Project','Most_Important_Proj').replace('Most_Important_Prj','Most_Important_Proj') for x in survey_data.columns})

    # NOTE(review): 2018-only column with no 2019 counterpart, presumably --
    # confirm before relying on this drop.
    survey_data = survey_data.drop('Use_freq:_discuss.kubernetes.io',axis=1)

    x = pd.to_datetime(survey_data.End_Date)
    survey_data = survey_data.assign(date_taken = x.dt.date)
    # .get with no default: answers outside the mapping become None here.
    survey_data = survey_data.assign(Contributing_Length = survey_data['Contributing_Length'].apply(contrib_length_2018_to_2019.get))

    # Rename the 2018 columns to their 2019 equivalents.
    survey_data = survey_data.rename(columns=convert_2018_to_2019)

    # Translate answer labels; .get(x, x) keeps unmapped answers unchanged.
    survey_data = survey_data.assign(Level_of_Contributor = survey_data['Level_of_Contributor'].apply(lambda x: ladder_level_2018_to_2019.get(x,x)))
    survey_data = survey_data.assign(Upstream_supported_at_employer = survey_data['Upstream_supported_at_employer'].apply(lambda x: employer_2018_to_2019.get(x,x)))
    survey_data = survey_data.assign(Interested_in_next_level = survey_data['Interested_in_next_level'].apply(lambda x: next_level_interest_2018_2019.get(x,x) ))
    survey_data = survey_data.assign(Contribute_to_other_OSS = survey_data['Contribute_to_other_OSS'].apply(lambda x: oss_projects_2018_to_2019.get(x,x)))
    # .loc assignment: the \xa0 in this column name rules out assign(**{...})
    # keyword-style access elsewhere; the name must match the 2019 header.
    survey_data.loc[:,'Do_you_use_the\xa0Help_Wanted_and/or_Good_First_Issue_labels_on_issues_you_file_to_find_contributors'] = survey_data['Do_you_use_the\xa0Help_Wanted_and/or_Good_First_Issue_labels_on_issues_you_file_to_find_contributors'].apply(lambda x: help_wanted_2018_to_2019.get(x,x))

    return survey_data
diff --git a/sig-contributor-experience/surveys/k8s_survey_analysis/prepare_2019.py b/sig-contributor-experience/surveys/k8s_survey_analysis/prepare_2019.py
new file mode 100644
index 00000000..ca8e6787
--- /dev/null
+++ b/sig-contributor-experience/surveys/k8s_survey_analysis/prepare_2019.py
@@ -0,0 +1,117 @@
+import pandas as pd
+import numpy as np
+
# Default location of the raw 2019 survey export.
fn = '2019_survey/2019 Kubernetes Contributor Experience Survey PUBLIC.csv'

# Exact first-row header text of each multi-column (checkbox / rating-matrix)
# question in the 2019 CSV. process_header() uses these to locate the start
# of each group and prefix its sub-columns. Copied verbatim from the export
# (including the curly quotes below) -- do not edit.
contribute_header = "What areas of Kubernetes do you contribute to? Please check all that apply."
blockers_header = "Please rate any challenges to the listed steps of the contribution process"
agree_header = "Do you agree with the following statements (1 - strongly disagree, 5 - strongly agree):"
attend_header = "Which of the below would make you likely to attend more of the Community Meetings? Check all that apply."
most_important_proj_header = "Some of the major projects SIG Contributor Experience is working on are listed below, rank the ones that are most important to you (and/or your SIG)"
use_freq_header = "Of our various communications channels, please rate which ones you use and/or check most frequently on a 1-5 scale, where 1 is “never”, 3 is “several times a month” and 5 is “every day”."
news_header = "Which of these channels is most likely to reach you first for news about decisions, changes, additions, and/or announcements to the contributor process or community matters?"
+
def map_blocker_and_usefreq_vals(val):
    """Normalize a rating answer to its integer value.

    Ratings arrive either as bare numbers ("3") or as labelled strings
    such as "5 - every day"; for the latter the first character carries
    the numeric rating on the survey's 1-5 scale.
    """
    try:
        rating = int(val)
    except ValueError:
        rating = int(val[0])
    return rating
+
def process_header(df):
    """Flatten the two-row SurveyMonkey header into single column names.

    The CSV is read with header=[0, 1]: question text on the first row,
    answer choice (or "Response" / "Unnamed: N") on the second. Each
    multi-column question group listed in the module-level *_header
    constants is renamed to "<prefix> <choice>"; every other column keeps
    its question text. Mutates ``df.columns`` in place and returns None.
    """
    multilabel_headers = (
        blockers_header,
        contribute_header,
        news_header,
        use_freq_header,
        most_important_proj_header,
        agree_header,
        attend_header,
    )

    old_cols = list(df.columns)
    flat = [None] * len(old_cols)
    group_starts = {}

    for idx, (top, sub) in enumerate(old_cols):
        if sub.startswith("Unnamed") or sub == "Response":
            # Single-answer question: second header row is empty, keep the
            # question text itself.
            flat[idx] = top
        elif top in multilabel_headers:
            # First column of a checkbox/matrix group; remember where it
            # starts so the group can be prefixed below.
            group_starts[top] = idx
        else:
            # Open-ended responses keep the question text as well.
            flat[idx] = top

    def label_group(header, prefix):
        # Walk forward through the group: after its first column, the top
        # header row reads "Unnamed: N" for the remaining choices.
        idx = group_starts[header]
        while idx < len(old_cols) and (old_cols[idx][0].startswith("Unnamed") or old_cols[idx][0] == header):
            flat[idx] = "{} {}".format(prefix, old_cols[idx][1])
            idx += 1

    label_group(contribute_header, "Contribute:")
    label_group(blockers_header, "Blocker:")
    label_group(news_header, "Check for news:")
    label_group(use_freq_header, "Use freq:")
    label_group(most_important_proj_header, "Most Important Project:")
    label_group(agree_header, "Agree:")
    label_group(attend_header, "Would attend if:")

    df.columns = flat
+
def get_df(file_name=None):
    """Load and normalize the 2019 contributor survey CSV.

    Args:
        file_name (str, optional): Path to the survey CSV export.
            Defaults to the module-level ``fn`` path.

    Returns:
        pandas.DataFrame: Cleaned survey responses with flattened,
        underscore-separated column names compatible with the 2018 data
        (see prepare_2018).
    """
    # Fall back to the module-level default path instead of re-hardcoding
    # the same string locally (the original shadowed ``fn``).
    path = file_name if file_name else fn

    # Two header rows: question text on top, answer choice underneath.
    df = pd.read_csv(path, header=[0, 1], skipinitialspace=True)
    process_header(df)

    # Duplicate keys removed from this dict literal: the Contributor Ladder
    # key appeared twice, and "What region of the world are you in?" was
    # mapped to both "World_Region" and "World Region" (only the last one
    # survived). "World Region" becomes "World_Region" in the underscore
    # rename below, so the net result is unchanged.
    df = df.rename(columns={
        "How long have you been contributing to Kubernetes?": "Contributing_Length",
        "What level of the Contributor Ladder do you consider yourself to be on? (pick the highest if you are in multiple OWNERs files)": "Level_of_Contributor",
        "What region of the world are you in?": "World Region",
        "Are you interested in advancing to the next level of the Contributor Ladder?": "Interested_in_next_level",
        "How many other open source projects not in the Kubernetes ecosystem do you contribute to? (example: nodejs, debian)": "Contribute_to_other_OSS",
        "Does your employer support your contributions to Kubernetes?": "Upstream_supported_at_employer",
        # Moved out of the "Blocker:" prefix so it is not treated as a rating.
        "Blocker: Other (please specify)": "Other blockers (please specify)",
    })

    #Clean Data
    # (Uses the module-level map_blocker_and_usefreq_vals; the original
    # redefined an identical copy here, shadowing it.)
    for x in df.columns:
        # Numeric "usefulness" ratings: treat a blank answer as a 0 rating.
        if x.startswith("Useful:"):
            df = df.assign(**{x: df[x].fillna(0)})
        # Checkbox questions export the choice text when ticked and NaN
        # otherwise; convert to 1/0 indicators.
        if x.startswith("Contribute:") or x.startswith("Check for news:") or x.startswith("Attended:") or x.startswith("Attending:") or x.startswith("Would attend if:"):
            df = df.assign(**{x: np.where(df[x].isna(), 0, 1)})
        # Employer-support question: keep non-answers as an explicit label.
        if x.startswith('Upstream'):
            df = df.assign(**{x: df[x].fillna("Didn't Answer")})
        # Rating questions: strip labels like "5 - every day" down to their
        # integer value, leaving missing answers as NaN.
        if (x.startswith("Blocker:") and x != "Blocker: Other (please specify)") or x.startswith("Use freq:") or x.startswith("Agree:"):
            df[x] = df[x].map(map_blocker_and_usefreq_vals, na_action="ignore")

    # Normalize column names: spaces -> underscores, drop "?", and collapse
    # the inconsistent "Most Important ..." spellings into one prefix.
    df = df.rename(columns={x: x.replace(" ", "_").replace("?", "").replace('Most_Important_Project', 'Most_Important_Proj').replace('Most_Important_Prj', 'Most_Important_Proj') for x in df.columns})

    df = df.assign(date_taken=pd.to_datetime(df.End_Date).dt.date)

    return df
+
+# TODO NOTE I should only be dropping these at plot time
+#df.dropna(subset=["Level_of_Contributor",
+# "Interested_in_next_level",
+# "Upstream_supported_at_employer"], inplace=True)
+