diff options
| author | Bob Killen <bob.killen@linux.com> | 2020-03-30 10:50:35 -0400 |
|---|---|---|
| committer | Bob Killen <bob.killen@linux.com> | 2020-03-30 16:13:53 -0400 |
| commit | 1a24e7bb4da9755f2ed3b65a20c9197c5b61ddc4 (patch) | |
| tree | 843063b339a82d55a8fb007abe3199a5c14328e8 | |
| parent | 9bd7de0d8dd8f21606cbad6f77edc6d02e14a42e (diff) | |
Add survey analysis scripts
Co-authored-by: Bryan Wilkinson <bwilkinson@minerkasch.com>
Co-authored-by: Bob Killen <bob.killen@linux.com>
4 files changed, 1263 insertions, 0 deletions
# Reconstructed from a line-collapsed diff rendering. The commit adds three
# files; `__init__.py` is empty, the two modules below are reproduced with
# conventional formatting. All runtime strings are byte-identical to the
# source; only formatting, comments/docstrings, and mutable-default
# arguments (now None sentinels, behavior-preserving) were changed.

# --- sig-contributor-experience/surveys/k8s_survey_analysis/plot_utils.py ---

from textwrap import wrap
import math
import plotnine as p9
import pandas as pd
import textwrap
from textwrap import shorten
from matplotlib import pyplot as plt
from copy import copy


from mizani.palettes import brewer_pal
from plotnine.scales.scale import scale_discrete


# Custom scales for plotnine that reverse the direction of the colors.
# They bypass the parent __init__ so the palette is built with direction=-1.
class reverse_scale_color_brewer(p9.scale_color_brewer):
    def __init__(self, type="seq", palette=1, direction=-1, **kwargs):
        self.palette = brewer_pal(type, palette, direction)
        scale_discrete.__init__(self, **kwargs)


class reverse_scale_fill_brewer(p9.scale_fill_brewer):
    def __init__(self, type="seq", palette=1, direction=-1, **kwargs):
        self.palette = brewer_pal(type, palette, direction)
        scale_discrete.__init__(self, **kwargs)


def split_for_likert(topic_data_long, mid_point):
    """
    Returns the aggregated counts for ratings in the top and bottom halves of
    each category, necessary for making offset bar charts.

    Args:
        topic_data_long (pandas.DataFrame): A pandas DataFrame storing each
            respondent's ratings for a given topic, in long format
        mid_point (int): The midpoint used to split the ratings into two halves

    Returns:
        (tuple): Tuple containing:
            (pandas.DataFrame): Aggregated counts for ratings greater than or
                equal to the midpoint
            (pandas.DataFrame): Aggregated counts for ratings less than or
                equal to the midpoint
    """
    x = topic_data_long.columns.tolist()
    x.remove("level_1")

    top_cutoff = topic_data_long["rating"] >= mid_point
    bottom_cutoff = topic_data_long["rating"] <= mid_point

    # Reindex against the full cartesian product of category values so that
    # (category, rating) combinations with zero responses still appear.
    top_scores = (
        topic_data_long[top_cutoff]
        .groupby(x)
        .count()
        .reindex(
            pd.MultiIndex.from_product(
                [topic_data_long[y].unique().tolist() for y in x], names=x
            ),
            fill_value=0,
        )
        .reset_index()
        .sort_index(ascending=False)
    )

    # The mid point is in both the top and bottom halves, so divide by two
    top_scores.loc[top_scores["rating"] == mid_point, "level_1"] = (
        top_scores[top_scores["rating"] == mid_point]["level_1"] / 2.0
    )

    bottom_scores = (
        topic_data_long[bottom_cutoff]
        .groupby(x)
        .count()
        .reindex(
            pd.MultiIndex.from_product(
                [topic_data_long[y].unique().tolist() for y in x], names=x
            ),
            fill_value=0,
        )
        .reset_index()
    )

    # The mid point is in both the top and bottom halves, so divide by two
    bottom_scores.loc[bottom_scores["rating"] == mid_point, "level_1"] = (
        bottom_scores[bottom_scores["rating"] == mid_point]["level_1"] / 2.0
    )

    return top_scores, bottom_scores


def make_long(data, facets, multi_year=False):
    """Converts a wide dataframe with columns for each topic's rating into a
    long dataframe.

    Args:
        data (pandas.DataFrame): A wide dataframe
        facets (list): List of columns to keep as their own column
        multi_year (bool, optional): Defaults to False. If True, add the
            "year" column to the list of facets

    Returns:
        (pandas.DataFrame): Long dataframe
    """
    facets = copy(facets)
    if multi_year:
        facets.append("year")
    long_data = data.set_index(facets, append=True).stack().reset_index()

    # Rename so level_0 always has the values of the topic we are interested
    # in; which auto-generated level name exists depends on how many facets
    # were set as index, so all candidates map to "level_0".
    long_data = long_data.rename(
        columns={
            "level_0": "level_1",
            "level_4": "level_0",
            "level_3": "level_0",
            "level_2": "level_0",
            0: "rating",
        }
    )
    long_data = long_data.assign(
        level_0=pd.Categorical(long_data.level_0, ordered=True)
    )
    return long_data


def get_data_subset(
    survey_data, topic, facets=None, exclude_new_contributors=False, include_year=False
):
    """Get only the relevant columns from the data.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        topic (str): String that all questions of interest start with
        facets (list, optional): List of columns used for grouping
        exclude_new_contributors (bool, optional): Defaults to False. If True,
            remove all responses from contributors who have been involved a
            year or less.
        include_year (bool, optional): Defaults to False. If True, include the
            year column in the output

    Returns:
        (pandas.DataFrame): Survey dataframe with only columns relevant to the
            topic and facets remaining.
    """
    # None sentinel instead of a mutable [] default; same semantics.
    facets = [] if facets is None else facets

    og_cols = [x for x in survey_data.columns if x.startswith(topic)]
    facets = copy(facets)
    if include_year:
        facets.append("year")
    if facets:
        # "." is a facet_grid placeholder, not a real column — strip it
        # before selecting columns, then restore it.
        if "." in facets:
            facets.remove(".")
            cols = og_cols + facets
            facets.append(".")
        else:
            cols = og_cols + facets
    else:
        cols = og_cols

    if exclude_new_contributors:
        topic_data = survey_data[
            survey_data["Contributing_Length"] != "less than one year"
        ][cols]
    else:
        topic_data = survey_data[cols]

    return topic_data


def get_multi_year_data_subset(
    survey_data, topic, facet_by=None, exclude_new_contributors=False
):
    """Get appropriate data for multi-year plots and convert it to long form.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        topic (str): String that all questions of interest start with
        facet_by (list, optional): List of columns used for grouping
        exclude_new_contributors (bool, optional): Defaults to False. If True,
            remove all responses from contributors who have been involved a
            year or less.

    Returns:
        (pandas.DataFrame): Long dataframe
    """
    facet_by = [] if facet_by is None else facet_by

    topic_data = get_data_subset(
        survey_data, topic, facet_by, exclude_new_contributors, include_year=True
    )

    if facet_by:
        # Temporarily drop the "." facet_grid placeholder around make_long;
        # the caller's list is restored to its original contents.
        if "." in facet_by:
            facet_by.remove(".")
            topic_data_long = make_long(topic_data, facet_by, multi_year=True)
            facet_by.append(".")
        else:
            topic_data_long = make_long(topic_data, facet_by, multi_year=True)
    else:
        topic_data_long = make_long(topic_data, [], multi_year=True)

    return topic_data_long


def get_single_year_data_subset(survey_data, topic, facet_by=None):
    """Get appropriate data for single-year plots and convert it to long form.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        topic (str): String that all questions of interest start with
        facet_by (list, optional): List of columns used for grouping

    Returns:
        (pandas.DataFrame): Long dataframe
    """
    facet_by = [] if facet_by is None else facet_by

    topic_data = get_data_subset(survey_data, topic, facet_by)

    if facet_by:
        if "." in facet_by:
            facet_by.remove(".")
            topic_data_long = make_long(topic_data, facet_by)
            facet_by.append(".")
        else:
            topic_data_long = make_long(topic_data, facet_by)
    else:

        topic_data_long = (
            topic_data.unstack().reset_index().rename(columns={0: "rating"})
        )
        topic_data_long = topic_data_long.assign(
            level_0=pd.Categorical(topic_data_long.level_0, ordered=True)
        )

    return topic_data_long


def make_bar_chart_multi_year(
    survey_data, topic, facet_by=None, exclude_new_contributors=False
):
    """Make a barchart showing proportions of respondents listing each
    column that starts with topic. Bars are colored by which year of
    the survey they correspond to. If facet_by is not empty, the resulting
    plot will be faceted into subplots by the variables given.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        topic (str): String that all questions of interest start with
        facet_by (list, optional): List of columns used for grouping
        exclude_new_contributors (bool, optional): Defaults to False. If True,
            do not include any responses from contributors with less than
            one year of experience

    Returns:
        (plotnine.ggplot): Plot object which can be displayed in a notebook or
            saved out to a file
    """
    facet_by = [] if facet_by is None else facet_by

    topic_data = get_data_subset(
        survey_data, topic, facet_by, exclude_new_contributors, include_year=True
    )

    if facet_by:
        fix = False
        if "." in facet_by:
            facet_by.remove(".")
            fix = True
        agg = (
            topic_data.groupby(facet_by + ["year"])
            .sum()
            .reset_index()
            .melt(id_vars=facet_by + ["year"])
        )
        totals = (
            topic_data.groupby(facet_by + ["year"])
            .count()
            .reset_index()
            .melt(id_vars=facet_by + ["year"])
        )
        percent = agg.merge(totals, on=facet_by + ["year", "variable"])

        if fix:
            facet_by.append(".")

    else:
        agg = topic_data.groupby(["year"]).sum().reset_index().melt(id_vars=["year"])
        totals = (
            topic_data.groupby(["year"]).count().reset_index().melt(id_vars=["year"])
        )
        percent = agg.merge(totals, on=["year", "variable"])

    # This plot is always done proportionally
    percent = percent.assign(value=percent["value_x"] / percent["value_y"])
    percent = percent.assign(variable=pd.Categorical(percent.variable, ordered=True))

    br = (
        p9.ggplot(percent, p9.aes(x="variable", fill="factor(year)", y="value"))
        + p9.geom_bar(show_legend=True, position="dodge", stat="identity")
        + p9.theme(
            axis_text_x=p9.element_text(angle=45, ha="right"),
            strip_text_y=p9.element_text(angle=0, ha="left"),
        )
        + p9.scale_x_discrete(
            limits=sorted(percent["variable"].unique().tolist()),
            labels=[
                shorten(
                    x.replace(topic, "").replace("_", " "), placeholder="...", width=30
                )
                for x in sorted(percent["variable"].unique().tolist())
            ],
        )
    )

    # Uncomment to return dataframe instead of plot
    # return percent

    if facet_by:
        br = (
            br
            + p9.facet_grid(
                facet_by,
                shrink=False,
                labeller=lambda x: "\n".join(wrap(x.replace("/", "/ "), 15)),
            )
            + p9.theme(
                strip_text_x=p9.element_text(wrap=True, va="bottom", margin={"b": -0.5})
            )
        )
    return br


def make_single_bar_chart_multi_year(survey_data, column, facet, proportionally=False):
    """Make a barchart showing the number of respondents responding to a
    single column. Bars are colored by which year of the survey they
    correspond to. If facet is not empty, the resulting plot will be faceted
    into subplots by the variables given.

    NOTE(review): `facet` is used inconsistently below — `[column, facet]`
    and `p9.aes(x=facet, ...)` treat it as a single column name, while
    `groupby(facet + ["year"])` and `make_long(topic_data, facet, ...)`
    require a list. Confirm the intended type against the calling notebooks;
    behavior is preserved as written.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        column (str): Column to plot responses to
        facet (list, optional): List of columns used for grouping
        proportionally (bool, optional): Defaults to False. If True,
            the bar heights are determined proportionally to the
            total number of responses in that facet.

    Returns:
        (plotnine.ggplot): Plot object which can be displayed in a notebook or
            saved out to a file
    """
    cols = [column, facet]
    show_legend = False
    topic_data = survey_data[cols + ["year"]]

    topic_data_long = make_long(topic_data, facet, multi_year=True)

    if proportionally:
        proportions = (
            topic_data_long[topic_data_long.rating == 1].groupby(facet + ["year"]).sum()
            / topic_data_long.groupby(facet + ["year"]).sum()
        ).reset_index()
    else:
        proportions = (
            topic_data_long[topic_data_long.rating == 1]
            .groupby(facet + ["year"])
            .count()
            .reset_index()
        )

    x = topic_data_long.columns.tolist()
    x.remove("level_1")

    ## Uncomment to return dataframe instead of plot
    # return proportions

    return (
        p9.ggplot(proportions, p9.aes(x=facet, fill="year", y="level_1"))
        + p9.geom_bar(show_legend=show_legend, stat="identity")
        + p9.theme(
            axis_text_x=p9.element_text(angle=45, ha="right"),
            strip_text_y=p9.element_text(angle=0, ha="left"),
        )
        + p9.scale_x_discrete(
            limits=topic_data_long[facet].unique().tolist(),
            labels=[
                x.replace("_", " ") for x in topic_data_long[facet].unique().tolist()
            ],
        )
    )


def make_likert_chart_multi_year(
    survey_data,
    topic,
    labels,
    facet_by=None,
    five_is_high=False,
    exclude_new_contributors=False,
):
    """Make an offset stacked barchart showing the number of respondents at
    each rank or value for all columns in the topic. Each column in the topic
    is a facet, with the years displayed along the x-axis.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        topic (str): String that all questions of interest start with
        labels (list): List of strings to use as labels, corresponding
            to the numerical values given by the respondents.
        facet_by (list, optional): List of columns used for grouping
        five_is_high (bool, optional): Defaults to False. If True,
            five is considered the highest value in a ranking, otherwise
            it is taken as the lowest value.
        exclude_new_contributors (bool, optional): Defaults to False. If True,
            do not include any responses from contributors with less than
            one year of experience

    Returns:
        (plotnine.ggplot): Offset stacked barchart plot object which
            can be displayed in a notebook or saved out to a file
    """
    facet_by = [] if facet_by is None else facet_by

    facet_by = copy(facet_by)
    og_cols = [x for x in survey_data.columns if x.startswith(topic)]
    show_legend = True

    topic_data_long = get_multi_year_data_subset(
        survey_data, topic, facet_by, exclude_new_contributors
    )

    # When five is the LOW end, negate ratings so the "good" half still ends
    # up above the axis after splitting at the midpoint.
    if not five_is_high:
        topic_data_long = topic_data_long.assign(rating=topic_data_long.rating * -1.0)

    mid_point = 3 if five_is_high else -3
    top_scores, bottom_scores = split_for_likert(topic_data_long, mid_point)

    if facet_by:
        fix = False
        if "." in facet_by:
            facet_by.remove(".")
            fix = True

        # Calculate proportion for each rank
        top_scores = top_scores.merge(
            topic_data_long.groupby(facet_by + ["year"]).count().reset_index(),
            on=facet_by + ["year"],
        ).rename(columns={"rating_x": "rating", "level_0_x": "level_0"})
        top_scores = top_scores.assign(
            level_1=top_scores.level_1_x / (top_scores.level_1_y / len(og_cols))
        )

        bottom_scores = bottom_scores.merge(
            topic_data_long.groupby(facet_by + ["year"]).count().reset_index(),
            on=facet_by + ["year"],
        ).rename(columns={"rating_x": "rating", "level_0_x": "level_0"})
        bottom_scores = bottom_scores.assign(
            level_1=bottom_scores.level_1_x
            * -1
            / (bottom_scores.level_1_y / len(og_cols))
        )

        if fix:
            facet_by.append(".")
    else:
        # Calculate proportion for each rank
        top_scores = top_scores.merge(
            topic_data_long.groupby(["year"]).count().reset_index(), on=["year"]
        ).rename(columns={"rating_x": "rating", "level_0_x": "level_0"})
        top_scores = top_scores.assign(
            level_1=top_scores.level_1_x / (top_scores.level_1_y / len(og_cols))
        )

        bottom_scores = bottom_scores.merge(
            topic_data_long.groupby(["year"]).count().reset_index(), on=["year"]
        ).rename(columns={"rating_x": "rating", "level_0_x": "level_0"})
        bottom_scores = bottom_scores.assign(
            level_1=bottom_scores.level_1_x
            * -1
            / (bottom_scores.level_1_y / len(og_cols))
        )

    vp = (
        p9.ggplot(
            topic_data_long,
            p9.aes(x="factor(year)", fill="factor(rating)", color="factor(rating)"),
        )
        + p9.geom_col(
            data=top_scores,
            mapping=p9.aes(y="level_1"),
            show_legend=show_legend,
            size=0.25,
            position=p9.position_stack(reverse=True),
        )
        + p9.geom_col(
            data=bottom_scores,
            mapping=p9.aes(y="level_1"),
            show_legend=show_legend,
            size=0.25,
            position=p9.position_stack(),
        )
        + p9.geom_hline(yintercept=0, color="white")
    )

    if five_is_high:
        vp = (
            vp
            + p9.scale_color_brewer(
                "div", "RdBu", limits=[1, 2, 3, 4, 5], labels=labels
            )
            + p9.scale_fill_brewer("div", "RdBu", limits=[1, 2, 3, 4, 5], labels=labels)
            + p9.theme(
                axis_text_x=p9.element_text(angle=45, ha="right"),
                strip_text_y=p9.element_text(angle=0, ha="left"),
            )
        )
    else:
        vp = (
            vp
            + p9.scale_color_brewer(
                "div", "RdBu", limits=[-5, -4, -3, -2, -1], labels=labels
            )
            + p9.scale_fill_brewer(
                "div", "RdBu", limits=[-5, -4, -3, -2, -1], labels=labels
            )
            + p9.theme(strip_text_y=p9.element_text(angle=0, ha="left"))
        )

    # NOTE(review): the remove() below assumes "." is present whenever
    # facet_by is non-empty (the convention used by the calling notebooks);
    # a facet list without "." would raise ValueError — confirm upstream.
    if facet_by:
        facet_by.remove(".")

    else:
        facet_by.append(".")

    vp = (
        vp
        + p9.facet_grid(
            facet_by + ["level_0"],
            labeller=lambda x: "\n".join(
                wrap(
                    x.replace(topic, "").replace("_", " ").replace("/", "/ ").strip(),
                    15,
                )
            ),
        )
        + p9.theme(
            strip_text_x=p9.element_text(wrap=True, ma="left"), panel_spacing_x=0.1
        )
    )

    return vp


def make_bar_chart(survey_data, topic, facet_by=None, proportional=False):
    """Make a barchart showing the number of respondents listing each
    column that starts with topic for a single year. If facet_by is
    not empty, the resulting plot will be faceted into subplots
    by the variables given.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        topic (str): String that all questions of interest start with
        facet_by (list, optional): List of columns used for grouping
        proportional (bool, optional): Defaults to False. If True,
            the bar heights are determined proportionally to the
            total number of responses in that facet.

    Returns:
        (plotnine.ggplot): Plot object which can be displayed in a notebook or
            saved out to a file
    """
    facet_by = [] if facet_by is None else facet_by

    show_legend = False
    if facet_by:
        show_legend = True

    topic_data_long = get_single_year_data_subset(survey_data, topic, facet_by)

    x = topic_data_long.columns.tolist()
    x.remove("level_1")

    if facet_by:
        period = False
        if "." in facet_by:
            facet_by.remove(".")
            period = True

        aggregate_data = (
            topic_data_long[topic_data_long.rating == 1]
            .dropna()
            .groupby(["level_0"] + facet_by)
            .count()
            .reset_index()
        )

        if period:
            facet_by.append(".")

    else:
        aggregate_data = (
            topic_data_long[topic_data_long.rating == 1]
            .dropna()
            .groupby("level_0")
            .count()
            .reset_index()
        )

    if proportional and facet_by:
        period = False
        if "." in facet_by:
            facet_by.remove(".")
            period = True

        facet_sums = (
            topic_data_long[topic_data_long.rating == 1]
            .dropna()
            .groupby(facet_by)
            .count()
            .reset_index()
        )

        aggregate_data = aggregate_data.merge(facet_sums, on=facet_by).rename(
            columns={"level_0_x": "level_0"}
        )
        aggregate_data = aggregate_data.assign(
            rating=aggregate_data.rating_x / aggregate_data.rating_y
        )

        if period:
            facet_by.append(".")

    br = (
        p9.ggplot(aggregate_data, p9.aes(x="level_0", fill="level_0", y="rating"))
        + p9.geom_bar(show_legend=show_legend, stat="identity")
        + p9.theme(
            axis_text_x=p9.element_text(angle=45, ha="right"),
            strip_text_y=p9.element_text(angle=0, ha="left"),
        )
        + p9.scale_x_discrete(
            limits=topic_data_long["level_0"].unique().tolist(),
            labels=[
                "\n".join(
                    textwrap.wrap(x.replace(topic, "").replace("_", " "), width=35)[0:2]
                )
                for x in topic_data_long["level_0"].unique().tolist()
            ],
        )
    )

    if facet_by:
        br = (
            br
            + p9.facet_grid(
                facet_by, shrink=False, labeller=lambda x: "\n".join(wrap(x, 15))
            )
            + p9.theme(
                axis_text_x=p9.element_blank(),
                strip_text_x=p9.element_text(
                    wrap=True, va="bottom", margin={"b": -0.5}
                ),
            )
            + p9.scale_fill_discrete(
                limits=topic_data_long["level_0"].unique().tolist(),
                labels=[
                    "\n".join(
                        wrap(
                            x.replace(topic, "")
                            .replace("_", " ")
                            .replace("/", "/ ")
                            .strip(),
                            30,
                        )
                    )
                    for x in topic_data_long["level_0"].unique().tolist()
                ],
            )
        )
    return br


def make_likert_chart(
    survey_data,
    topic,
    labels,
    facet_by=None,
    max_value=5,
    max_is_high=False,
    wrap_facets=True,
    sort_x=False,
):
    """Make an offset stacked barchart showing the number of respondents at
    each rank or value for all columns in the topic. Each column in the
    original data is a tick on the x-axis.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        topic (str): String that all questions of interest start with
        labels (list): List of strings to use as labels, corresponding
            to the numerical values given by the respondents.
        facet_by (list, optional): List of columns used for grouping
        max_value (int, optional): Defaults to 5. The maximum value a
            respondent can assign.
        max_is_high (bool, optional): Defaults to False. If True,
            the max_value is considered the highest value in a ranking,
            otherwise it is taken as the lowest value.
        wrap_facets (bool, optional): Defaults to True. If True, the facet
            labels are wrapped
        sort_x (bool, optional): Defaults to False. If True, the x-axis is
            sorted by the mean value for each column in the original data

    Returns:
        (plotnine.ggplot): Offset stacked barchart plot object which
            can be displayed in a notebook or saved out to a file
    """
    facet_by = [] if facet_by is None else facet_by

    mid_point = math.ceil(max_value / 2)

    og_cols = [x for x in survey_data.columns if x.startswith(topic)]
    show_legend = True

    topic_data_long = get_single_year_data_subset(survey_data, topic, facet_by)

    if not max_is_high:
        topic_data_long = topic_data_long.assign(rating=topic_data_long.rating * -1.0)

        mid_point = -1 * mid_point

    top_scores, bottom_scores = split_for_likert(topic_data_long, mid_point)

    if facet_by:
        fix = False
        if "." in facet_by:
            facet_by.remove(".")
            fix = True

        top_scores = top_scores.merge(
            topic_data_long.groupby(facet_by).count().reset_index(), on=facet_by
        ).rename(columns={"rating_x": "rating", "level_0_x": "level_0"})
        top_scores = top_scores.assign(
            level_1=top_scores.level_1_x / (top_scores.level_1_y / len(og_cols))
        )

        bottom_scores = bottom_scores.merge(
            topic_data_long.groupby(facet_by).count().reset_index(), on=facet_by
        ).rename(columns={"rating_x": "rating", "level_0_x": "level_0"})
        bottom_scores = bottom_scores.assign(
            level_1=bottom_scores.level_1_x
            * -1
            / (bottom_scores.level_1_y / len(og_cols))
        )

        if fix:
            facet_by.append(".")

    else:
        bottom_scores = bottom_scores.assign(level_1=bottom_scores.level_1 * -1)

    if sort_x:
        x_sort_order = (
            topic_data_long.groupby("level_0")
            .mean()
            .sort_values("rating")
            .reset_index()["level_0"]
            .values.tolist()
        )
        x_sort_order.reverse()
    else:
        x_sort_order = topic_data_long["level_0"].unique().tolist()

    vp = (
        p9.ggplot(
            topic_data_long,
            p9.aes(x="level_0", fill="factor(rating)", color="factor(rating)"),
        )
        + p9.geom_col(
            data=top_scores,
            mapping=p9.aes(y="level_1"),
            show_legend=show_legend,
            size=0.25,
            position=p9.position_stack(reverse=True),
        )
        + p9.geom_col(
            data=bottom_scores,
            mapping=p9.aes(y="level_1"),
            show_legend=show_legend,
            size=0.25,
            position=p9.position_stack(),
        )
        + p9.geom_hline(yintercept=0, color="white")
        + p9.theme(
            axis_text_x=p9.element_text(angle=45, ha="right"),
            strip_text_y=p9.element_text(angle=0, ha="left"),
        )
        + p9.scale_x_discrete(
            limits=x_sort_order,
            labels=[
                "\n".join(
                    textwrap.wrap(x.replace(topic, "").replace("_", " "), width=35)[0:2]
                )
                for x in x_sort_order
            ],
        )
    )

    if max_is_high:
        vp = (
            vp
            + p9.scale_color_brewer(
                "div", "RdBu", limits=list(range(1, max_value + 1)), labels=labels
            )
            + p9.scale_fill_brewer(
                "div", "RdBu", limits=list(range(1, max_value + 1)), labels=labels
            )
        )

    else:
        vp = (
            vp
            + reverse_scale_fill_brewer(
                "div",
                "RdBu",
                limits=list(reversed(range(-max_value, 0))),
                labels=labels,
            )
            + reverse_scale_color_brewer(
                "div",
                "RdBu",
                limits=list(reversed(range(-max_value, 0))),
                labels=labels,
            )
        )

    if facet_by:
        if wrap_facets:
            vp = (
                vp
                + p9.facet_grid(facet_by, labeller=lambda x: "\n".join(wrap(x, 15)))
                + p9.theme(
                    strip_text_x=p9.element_text(
                        wrap=True, va="bottom", margin={"b": -0.5}
                    )
                )
            )
        else:
            vp = vp + p9.facet_grid(facet_by, space="free", labeller=lambda x: x)
    return vp


def make_single_likert_chart(survey_data, column, facet, labels, five_is_high=False):
    """Make an offset stacked barchart showing the number of respondents at
    each rank or value for a single column in the original data. Each facet
    is shown as a tick on the x-axis.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        column (str): Column to plot responses to
        labels (list): List of strings to use as labels, corresponding
            to the numerical values given by the respondents.
        facet (str): Column used for grouping
        five_is_high (bool, optional): Defaults to False. If True,
            5 is considered the highest value in a ranking, otherwise
            it is taken as the lowest value.

    Returns:
        (plotnine.ggplot): Offset stacked barchart plot object which
            can be displayed in a notebook or saved out to a file
    """
    mid_point = 3
    cols = [column, facet]
    show_legend = True
    topic_data = survey_data[cols]

    topic_data_long = make_long(topic_data, facet)

    if not five_is_high:
        topic_data_long = topic_data_long.assign(rating=topic_data_long.rating * -1.0)
    x = topic_data_long.columns.tolist()
    x.remove("level_1")
    x.remove("level_0")

    if not five_is_high:
        mid_point *= -1

    top_cutoff = topic_data_long["rating"] >= mid_point
    bottom_cutoff = topic_data_long["rating"] <= mid_point

    top_scores = (
        topic_data_long[top_cutoff]
        .groupby(x)
        .count()
        .reset_index()
        .sort_index(ascending=False)
    )

    # The midpoint belongs to both halves, so split its count in two.
    top_scores.loc[top_scores["rating"] == mid_point, "level_1"] = (
        top_scores[top_scores["rating"] == mid_point]["level_1"] / 2.0
    )
    top_scores = top_scores.merge(
        topic_data_long.groupby(facet).count().reset_index(), on=facet
    )
    top_scores = top_scores.assign(level_1=top_scores.level_1_x / top_scores.level_1_y)

    bottom_scores = topic_data_long[bottom_cutoff].groupby(x).count().reset_index()
    bottom_scores.loc[bottom_scores["rating"] == mid_point, "level_1"] = (
        bottom_scores[bottom_scores["rating"] == mid_point]["level_1"] / 2.0
    )
    bottom_scores = bottom_scores.merge(
        topic_data_long.groupby(facet).count().reset_index(), on=facet
    )
    bottom_scores = bottom_scores.assign(
        level_1=bottom_scores.level_1_x * -1 / bottom_scores.level_1_y
    )

    vp = (
        p9.ggplot(
            topic_data_long,
            p9.aes(x=facet, fill="factor(rating_x)", color="factor(rating_x)"),
        )
        + p9.geom_col(
            data=top_scores,
            mapping=p9.aes(y="level_1"),
            show_legend=show_legend,
            size=0.25,
            position=p9.position_stack(reverse=True),
        )
        + p9.geom_col(
            data=bottom_scores,
            mapping=p9.aes(y="level_1"),
            show_legend=show_legend,
            size=0.25,
        )
        + p9.geom_hline(yintercept=0, color="white")
        + p9.theme(
            axis_text_x=p9.element_text(angle=45, ha="right"),
            strip_text_y=p9.element_text(angle=0, ha="left"),
        )
        + p9.scale_x_discrete(
            limits=topic_data_long[facet].unique().tolist(),
            labels=[
                x.replace("_", " ") for x in topic_data_long[facet].unique().tolist()
            ],
        )
    )

    if five_is_high:
        vp = (
            vp
            + p9.scale_color_brewer(
                "div",
                "RdBu",
                limits=[1, 2, 3, 4, 5],
                labels=["\n".join(wrap(x, 15)) for x in labels],
            )
            + p9.scale_fill_brewer(
                "div",
                "RdBu",
                limits=[1, 2, 3, 4, 5],
                labels=["\n".join(wrap(x, 15)) for x in labels],
            )
        )
    else:
        vp = (
            vp
            + reverse_scale_fill_brewer(
                "div",
                "RdBu",
                limits=[-1, -2, -3, -4, -5],
                labels=["\n".join(wrap(x, 15)) for x in labels],
            )
            + reverse_scale_color_brewer(
                "div",
                "RdBu",
                limits=[-1, -2, -3, -4, -5],
                labels=["\n".join(wrap(x, 15)) for x in labels],
            )
        )

    return vp


def make_single_bar_chart(survey_data, column, facet, proportionally=False, facet2=None):
    """Make a barchart showing the number of respondents marking
    a certain column in the original dataset as True. The facet
    variable values are used as ticks on the x-axis.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        column (str): Column to plot responses to
        facet (str): Column used for grouping
        proportionally (bool, optional): Defaults to False. If True,
            the bar heights are determined proportionally to the
            total number of responses in that facet.
        facet2 (str, optional): If provided, a second variable to facet
            against.

    Returns:
        (plotnine.ggplot): Plot object which can be displayed in a notebook or
            saved out to a file
    """
    cols = [column, facet]
    if facet2:
        cols.append(facet2)
    show_legend = False
    topic_data = survey_data[cols]

    grouper = [facet, facet2] if facet2 else facet
    topic_data_long = make_long(topic_data, grouper)

    if proportionally:
        proportions = (
            topic_data_long[topic_data_long.rating == 1].groupby(grouper).sum()
            / topic_data_long.groupby(grouper).sum()
        ).reset_index()
    else:
        proportions = (
            topic_data_long[topic_data_long.rating == 1]
            .groupby(grouper)
            .count()
            .reset_index()
        )

    x = topic_data_long.columns.tolist()
    x.remove("level_1")

    br = (
        p9.ggplot(proportions, p9.aes(x=facet, fill=facet, y="level_1"))
        + p9.geom_bar(show_legend=show_legend, stat="identity")
        + p9.theme(
            axis_text_x=p9.element_text(angle=45, ha="right"),
            strip_text_y=p9.element_text(angle=0, ha="left"),
        )
        + p9.scale_x_discrete(
            limits=topic_data_long[facet].unique().tolist(),
            labels=[
                x.replace("_", " ") for x in topic_data_long[facet].unique().tolist()
            ],
        )
    )

    if facet2:
        br = br + p9.facet_grid([facet2, "."])

    return br


# --- sig-contributor-experience/surveys/k8s_survey_analysis/prepare_2018.py ---

import pandas as pd
import numpy as np


# Maps 2018 survey column names to their 2019 equivalents so the two years
# can be analyzed together. Keys/values are verbatim survey headers.
convert_2018_to_2019 = {
    'Blocker:_Code/Doc_review': 'Blocker:_Code/Documentation_review',
    'Blocker:_GH_tools&processes_(not_our_customized_tooling)': 'Blocker:_GitHub_tools_and_processes_(not_our_customized_tooling)',
    'Blocker:_Finding_a/the_right_SIG': 'Blocker:_Finding_the_right_SIG_for_your_contributions',
    'Blocker:_Finding_issues_to_work_on': 'Blocker:_Finding_appropriate_issues_to_work_on',
    'Blocker:_Setting_up_dev_env': 'Blocker:_Setting_up_development_environment',
    'Use_freq:_Zoom_Mtgs': 'Use_freq:_Zoom_video_conferencing/meetings',
    'Use_freq:_GH_(comments,_issues,_prs)': 'Use_freq:_Discussions_on_Github_Issues_and_PRs',
    'Use_freq:_Unofficial(Twitter,_Reddit,_etc.)': 'Use_freq:_Unofficial_channels_(IRC,_WeChat,_etc.)',
    'Use_freq:_YT_Recordings': 'Use_freq:_YouTube_recordings_(community_meetings,_SIG/WG_meetings,_etc.)',
    'Use_freq:_GDocs/Forms/Sheets,_etc_(meeting_agendas,_etc)': 'Use_freq:_Google_Docs/Forms/Sheets,_etc_(meeting_agendas,_etc)',
    'Contribute:_code_to_k/k': 'Contribute:_Core_code_inside_of_kubernetes/kubernetes',
    'Contribute:_code_in_a_k/*_GH_org': 'Contribute:_Code_inside_of_another_repo_in_the_Kubernetes_GitHub_Org_(example:_/kubernetes-sigs,_kubernetes/website,_etc)',
    'Contribute:_Docs': 'Contribute:_Documentation',
    'Contribute:_Testing_and_CI': 'Contribute:_Testing_&_Infrastructure',
    'Contribute:_Related_projects_(Kubeadm,_Helm,_container_runtimes,_etc.)': 'Contribute:_Related_projects_(Helm,_container_runtimes,_other_CNCF_projects,_etc.)',
    'Contribute:_Not_yet': 'Contribute:_Don’t_contribute_yet,_hoping_to_start_soon',
    'Contribute:_Other': 'Contribute:_Other_(please_specify)',
    'Level_of_Contributor_Laddor': 'Level_of_Contributor',
    'Most_Important_Proj:_Mentoring_programs': 'Most_Important_Proj:_Mentoring_programs_for_all_contributor_levels/roles\xa0(https://git.k8s.io/community/community-membership.md)',
    'Most_Important_Proj:_GH_Mgmt': 'Most_Important_Proj:_GitHub_Management',
    'Most_Important_Proj:_Contributor_Summits': 'Most_Important_Proj:_Delivering_valuable_contributor_summits_at_relevant_events',
    'Most_Important_Proj:_Keeping_community_safe': 'Most_Important_Proj:_Keeping_our_community_safe_on_our_various_communication_platforms_through_moderation_guidelines_and_new_approaches',
    'Check_for_news:_k-dev_ML': 'Check_for_news:_kubernetes-dev@_mailing_list',
    'Check_for_news:_discuss.kubernetes.io': 'Check_for_news:_Dedicated_discuss.k8s.io_forum_for_contributors',
    'Check_for_news:_contribex_ML': 'Check_for_news:_kubernetes-sig-contribex@\xa0mailing_list',
    'Check_for_news:_Slack': 'Check_for_news:_#kubernetes-dev,_#sig-foo,_#sig-contribex_slack',
    'Check_for_news:_Twitter_read_first_': 'Check_for_news:_@kubernetesio_twitter',
    'Check_for_news:_Kubernetes_blog_read_first_': 'Check_for_news:_Kubernetes_blog',
    'Check_for_news:_k/community_repo_in_GH_(Issues_and/or_PRs)_read_first': 'Check_for_news:_kubernetes/community_repo_in_GitHub_(Issues_and/or_PRs)',
    'Check_for_news:_Other': 'Check_for_news:_Other_(please_specify)',
    'Attended:_#_of_ContribSummits': 'How_many_Kubernetes_Contributor_Summits_have_you_attended',
    'HelpWanted_&/or_GoodFirstIssue_label_usage': 'Do_you_use_the\xa0Help_Wanted_and/or_Good_First_Issue_labels_on_issues_you_file_to_find_contributors',
    'Watched_or_participated_in_MoC': 'Have_you_watched_or_participated_in_an_episode_of_our_YouTube_mentoring_series_Meet_Our_Contributors_If_you_have_specific_suggestions,_leave_them_at_the_end_of_the_survey.',
    'Make_project_easier_to_contribute': 'Are_there_specific_ways_the_project_could_make_contributing_easier_for_you'
}

# 2018 answer wordings -> 2019 answer wordings, per question.
contrib_length_2018_to_2019 = {
    '1-2 years': 'one to two years',
    '2-3 years': 'two to three years',
    '3+ years': 'three+ years',
    '6 months-1 year': 'less than one year',
    'Just started': 'less than one year'
}

ladder_level_2018_to_2019 = {
    "Approver": "approver",
    "Had no idea this was even a thing": "there's a contributor ladder?",
    "Org Member": "member",
    "Reviewer": "reviewer",
    "I’m not an org member yet, but working on it": "not yet a member but working on it",
    "Subproject Owner": "subproject owner"
}

employer_2018_to_2019 = {
    "It’s complicated": "it's complicated.",
    "It’s entirely on my own time": "no, I need to use my own time",
    "Yes, it’s part of my job": "yes, I can contribute on company time",
    'No, but I’m able to use “free” time at work': "yes, I can contribute on company time"
}

oss_projects_2018_to_2019 = {
    'None, Kubernetes is my first one!': 'this is my first open source project!',
    'One more': '1 other',
    '2-4': '2 or more',
    '4+': '2 or more'
}

help_wanted_2018_to_2019 = {
    "No, because I didn't know they were there": "No",
    "No, because I don't think my issues qualify": "No",
    'Not as much as I should because I forget': "Rarely (for reasons)"
}

next_level_interest_2018_2019 = {
    'Yes, but would like mentorship.': 'if I had help/mentoring/support',
    'Yes, but not sure I have time.': 'if I had more free time',
    'Yes, doing it on my own.': 'yes',
    "No, I'm already an owner": 'no, already a subproject owner (highest level on the ladder)',
    'Not really': 'no'
}


def get_df(path):
    """Read the 2018 survey CSV and normalize it toward the 2019 schema.

    Args:
        path (str): Path to the 2018 survey CSV export.

    Returns:
        (pandas.DataFrame): Cleaned survey data with 2019-style column names.
    """
    survey_data = pd.read_csv(path)

    # Clean data: fill/convert the checkbox-style columns.
    for x in survey_data.columns:
        if x.startswith("Useful:"):
            survey_data = survey_data.assign(**{x: survey_data[x].fillna(0)})
        if x.startswith("Contribute:") or x.startswith("Check for news:") or x.startswith("Attended:") or x.startswith("Attending:") or x.startswith("Most Important Pr"):
            survey_data = survey_data.assign(**{x: np.where(survey_data[x].isna(), 0, 1)})
        if x.startswith('Upstream'):
            survey_data = survey_data.assign(**{x: survey_data[x].fillna("Didn't Answer")})

    survey_data = survey_data.rename(
        columns={
            x: x.replace(" ", "_")
            .replace("?", "")
            .replace('Most_Important_Project', 'Most_Important_Proj')
            .replace('Most_Important_Prj', 'Most_Important_Proj')
            for x in survey_data.columns
        }
    )

    survey_data = survey_data.drop('Use_freq:_discuss.kubernetes.io', axis=1)

    x = pd.to_datetime(survey_data.End_Date)
    survey_data = survey_data.assign(date_taken=x.dt.date)
    survey_data = survey_data.assign(
        Contributing_Length=survey_data['Contributing_Length'].apply(
            contrib_length_2018_to_2019.get
        )
    )

    survey_data = survey_data.rename(columns=convert_2018_to_2019)

    # NOTE(review): the source view is truncated here mid-statement
    # ("survey_data = ..."); the remaining 2018-specific transformations are
    # not visible in this chunk. Returning the frame as processed so far —
    # TODO: restore the truncated tail from the full file before relying on
    # this function.
    return survey_data
survey_data.assign(Level_of_Contributor = survey_data['Level_of_Contributor'].apply(lambda x: ladder_level_2018_to_2019.get(x,x))) + survey_data = survey_data.assign(Upstream_supported_at_employer = survey_data['Upstream_supported_at_employer'].apply(lambda x: employer_2018_to_2019.get(x,x))) + survey_data = survey_data.assign(Interested_in_next_level = survey_data['Interested_in_next_level'].apply(lambda x: next_level_interest_2018_2019.get(x,x) )) + survey_data = survey_data.assign(Contribute_to_other_OSS = survey_data['Contribute_to_other_OSS'].apply(lambda x: oss_projects_2018_to_2019.get(x,x))) + survey_data.loc[:,'Do_you_use_the\xa0Help_Wanted_and/or_Good_First_Issue_labels_on_issues_you_file_to_find_contributors'] = survey_data['Do_you_use_the\xa0Help_Wanted_and/or_Good_First_Issue_labels_on_issues_you_file_to_find_contributors'].apply(lambda x: help_wanted_2018_to_2019.get(x,x)) + + return survey_data diff --git a/sig-contributor-experience/surveys/k8s_survey_analysis/prepare_2019.py b/sig-contributor-experience/surveys/k8s_survey_analysis/prepare_2019.py new file mode 100644 index 00000000..ca8e6787 --- /dev/null +++ b/sig-contributor-experience/surveys/k8s_survey_analysis/prepare_2019.py @@ -0,0 +1,117 @@ +import pandas as pd +import numpy as np + +fn = '2019_survey/2019 Kubernetes Contributor Experience Survey PUBLIC.csv' + +contribute_header = "What areas of Kubernetes do you contribute to? Please check all that apply." +blockers_header = "Please rate any challenges to the listed steps of the contribution process" +agree_header = "Do you agree with the following statements (1 - strongly disagree, 5 - strongly agree):" +attend_header = "Which of the below would make you likely to attend more of the Community Meetings? Check all that apply." 
most_important_proj_header = "Some of the major projects SIG Contributor Experience is working on are listed below, rank the ones that are most important to you (and/or your SIG)"
use_freq_header = "Of our various communications channels, please rate which ones you use and/or check most frequently on a 1-5 scale, where 1 is “never”, 3 is “several times a month” and 5 is “every day”."
news_header = "Which of these channels is most likely to reach you first for news about decisions, changes, additions, and/or announcements to the contributor process or community matters?"


def map_blocker_and_usefreq_vals(val):
    """Convert a 1-5 rating cell to an int.

    The export stores ratings either as a bare number (e.g. "3") or as a
    labelled option whose first character is the numeric value; fall back to
    that first character when the whole cell is not parseable as an int.

    Raises:
        ValueError: if neither the cell nor its first character is an int.
    """
    try:
        return int(val)
    except ValueError:
        return int(val[0])


def process_header(df):
    """Flatten the two-row SurveyMonkey header into single column names.

    The CSV is read with header=[0, 1]: level 0 holds the question text
    ("Unnamed..." for continuation columns of matrix questions) and level 1
    holds the sub-option text ("Response" for simple questions). Matrix and
    checkbox columns are renamed to "<prefix> <sub-option>"; all other
    columns keep their question text. Mutates df.columns in place.

    Raises:
        NameError: if one of the expected section headers is absent from the
        export (its start index is never assigned before prefix_cols runs).
    """
    columns = list(df.columns)
    new_columns = [None] * len(columns)
    for i, col in enumerate(columns):
        # Continuation and single-"Response" columns: keep the question text
        # for now; matrix columns get overwritten by prefix_cols below.
        if col[1].startswith("Unnamed") or col[1] == "Response":
            new_columns[i] = col[0]
            continue

        # Find the starting column for the multilabel responses (checkboxes)
        # that were also in the 2018 survey
        if col[0] == blockers_header:
            blockers_i = i
        elif col[0] == contribute_header:
            contribute_i = i
        elif col[0] == news_header:
            news_i = i
        elif col[0] == use_freq_header:
            use_freq_i = i
        elif col[0] == most_important_proj_header:
            most_important_proj_i = i
        elif col[0] == agree_header:  # Starting columns for multilabel responses that weren't in the 2018 survey.
            agree_i = i
        elif col[0] == attend_header:
            attend_i = i
        #elif col[0] == unattendance_header:
        #    unattendance_i = i
        else:  # Handle open ended responses
            new_columns[i] = col[0]

    def prefix_cols(header, header_i, prefix):
        # Rename every column of the matrix question starting at header_i:
        # the run ends at the first column whose level-0 label is neither the
        # header nor an "Unnamed" continuation.
        i = header_i
        while i < len(columns) and (columns[i][0].startswith("Unnamed") or columns[i][0] == header):
            new_columns[i] = "{} {}".format(prefix, columns[i][1])
            i += 1

    prefix_cols(contribute_header, contribute_i, "Contribute:")
    prefix_cols(blockers_header, blockers_i, "Blocker:")
    prefix_cols(news_header, news_i, "Check for news:")
    prefix_cols(use_freq_header, use_freq_i, "Use freq:")
    prefix_cols(most_important_proj_header, most_important_proj_i, "Most Important Project:")

    prefix_cols(agree_header, agree_i, "Agree:")
    prefix_cols(attend_header, attend_i, "Would attend if:")

    df.columns = new_columns


def get_df(file_name=None):
    """Load and clean the 2019 contributor survey export.

    Args:
        file_name (str, optional): CSV path; defaults to the standard
            2019 export location.

    Returns:
        pandas.DataFrame: Cleaned responses with flattened, underscored
        column names, numeric rating columns, 0/1 checkbox columns, and a
        date_taken column derived from End_Date.
    """
    fn = '2019_survey/2019 Kubernetes Contributor Experience Survey PUBLIC.csv'
    if file_name:
        fn = file_name

    df = pd.read_csv(fn, header=[0, 1], skipinitialspace=True)
    process_header(df)

    # Shorten the long question texts used as column names. The original
    # literal repeated two keys ("What level of the Contributor Ladder..."
    # and "What region of the world are you in?"); duplicates are removed
    # here, keeping the last-wins value ("World Region") so behavior is
    # unchanged (spaces become underscores in the bulk rename below anyway).
    df = df.rename(columns={
        "How long have you been contributing to Kubernetes?": "Contributing_Length",
        "What level of the Contributor Ladder do you consider yourself to be on? (pick the highest if you are in multiple OWNERs files)": "Level_of_Contributor",
        "Are you interested in advancing to the next level of the Contributor Ladder?": "Interested_in_next_level",
        "How many other open source projects not in the Kubernetes ecosystem do you contribute to? (example: nodejs, debian)": "Contribute_to_other_OSS",
        "Does your employer support your contributions to Kubernetes?": "Upstream_supported_at_employer",
        "Blocker: Other (please specify)": "Other blockers (please specify)",
        "What region of the world are you in?": "World Region",
    })

    # Clean data (the original re-defined map_blocker_and_usefreq_vals here,
    # shadowing the identical module-level function; the module-level one is
    # used directly instead).
    for x in df.columns:
        if x.startswith("Useful:"):
            # Unanswered usefulness ratings count as 0.
            df = df.assign(**{x: df[x].fillna(0)})
        if x.startswith(("Contribute:", "Check for news:", "Attended:", "Attending:", "Would attend if:")):
            # Checkbox columns: any answer -> 1, blank -> 0.
            df = df.assign(**{x: np.where(df[x].isna(), 0, 1)})
        if x.startswith('Upstream'):
            df = df.assign(**{x: df[x].fillna("Didn't Answer")})
        if x.startswith("Blocker:") and x != "Blocker: Other (please specify)":
            # na_action="ignore" leaves missing ratings as NaN.
            df[x] = df[x].map(map_blocker_and_usefreq_vals, na_action="ignore")
        if x.startswith(("Use freq:", "Agree:")):
            df[x] = df[x].map(map_blocker_and_usefreq_vals, na_action="ignore")

    # Normalize column names: spaces -> underscores, drop "?", and unify the
    # "Most Important Project"/"...Prj" prefix spellings.
    df = df.rename(
        columns={
            x: x.replace(" ", "_")
            .replace("?", "")
            .replace('Most_Important_Project', 'Most_Important_Proj')
            .replace('Most_Important_Prj', 'Most_Important_Proj')
            for x in df.columns
        }
    )

    x = pd.to_datetime(df.End_Date)
    df = df.assign(date_taken=x.dt.date)

    return df


# TODO NOTE I should only be dropping these at plot time
# df.dropna(subset=["Level_of_Contributor",
#                   "Interested_in_next_level",
#                   "Upstream_supported_at_employer"], inplace=True)
