"""Plotting helpers (bar charts and offset "Likert" charts) for survey data.

All plot functions take the wide survey dataframe and return a plotnine plot
object. Throughout this module the long-format frames use three fixed column
names: ``level_0`` (the name of the topic column a value came from),
``level_1`` (the original row index, later overwritten with counts or
proportions) and ``rating`` (the value given by the respondent).
"""
import math
import textwrap
from copy import copy
from textwrap import shorten
from textwrap import wrap

import pandas as pd
import plotnine as p9
from matplotlib import pyplot as plt
from mizani.palettes import brewer_pal
from plotnine.scales.scale import scale_discrete


# Custom scales for plotnine that reverse the direction of the colors
class reverse_scale_color_brewer(p9.scale_color_brewer):
    """Brewer color scale whose palette direction defaults to -1 (reversed)."""

    def __init__(self, type="seq", palette=1, direction=-1, **kwargs):
        self.palette = brewer_pal(type, palette, direction)
        # Deliberately skips the parent __init__ so the palette built above
        # (with the reversed direction) is the one that gets used.
        scale_discrete.__init__(self, **kwargs)


class reverse_scale_fill_brewer(p9.scale_fill_brewer):
    """Brewer fill scale whose palette direction defaults to -1 (reversed)."""

    def __init__(self, type="seq", palette=1, direction=-1, **kwargs):
        self.palette = brewer_pal(type, palette, direction)
        # Deliberately skips the parent __init__ so the palette built above
        # (with the reversed direction) is the one that gets used.
        scale_discrete.__init__(self, **kwargs)


def _drop_dot(facets):
    """Return a new list of facet columns without the "." placeholder.

    The "." entry only has meaning for ``facet_grid``; it is not a column.
    A new list is returned so the caller's list is never modified.
    """
    return [facet for facet in (facets or []) if facet != "."]


def _halve_mid_point(scores, mid_point):
    """Return a copy of ``scores`` with the mid-point counts divided by two.

    The mid-point rating is present in both the top and the bottom half, so
    each half only gets half of its count. The count column is converted to
    float first because the halved values are not necessarily integers.
    """
    scores = scores.assign(level_1=scores["level_1"].astype(float))
    is_mid = scores["rating"] == mid_point
    scores.loc[is_mid, "level_1"] = scores.loc[is_mid, "level_1"] / 2.0
    return scores


def _scale_to_proportions(scores, topic_data_long, keys, n_topics, sign=1):
    """Convert the counts in ``scores`` to (signed) proportions per group.

    Args:
        scores (pandas.DataFrame): One half returned by ``split_for_likert``
        topic_data_long (pandas.DataFrame): Long dataframe the counts were
            derived from
        keys (list): Columns identifying a group (facets and/or "year")
        n_topics (int): Number of topic columns. The per-group row count is
            divided by this number before it is used as the denominator
        sign (int, optional): Defaults to 1. Use -1 for the bottom half so
            the bars are drawn below zero

    Returns:
        (pandas.DataFrame): ``scores`` merged with the group totals, with
            ``level_1`` holding the proportion
    """
    totals = topic_data_long.groupby(keys).count().reset_index()
    # Both frames have rating/level_0/level_1 columns; after the merge the
    # "_x" versions come from ``scores`` and the "_y" versions are totals.
    merged = scores.merge(totals, on=keys).rename(
        columns={"rating_x": "rating", "level_0_x": "level_0"}
    )
    return merged.assign(
        level_1=sign * merged.level_1_x / (merged.level_1_y / n_topics)
    )


def split_for_likert(topic_data_long, mid_point):
    """
    Returns the aggregated counts for ratings in the top and bottom halves
    of each category, necessary for making offset bar charts

    Args:
        topic_data_long (pandas.DataFrame): A pandas DataFrame storing each
            respondent's ratings for a given topic, in long format
        mid_point (int): The midpoint to use to split the data into two
            halves, based on ratings

    Returns:
        (tuple): Tuple containing:
            (pandas.DataFrame): Aggregated counts for ratings greater than or
                equal to the midpoint, in reversed row order
            (pandas.DataFrame): Aggregated counts for ratings less than or
                equal to the midpoint
    """
    group_cols = topic_data_long.columns.tolist()
    group_cols.remove("level_1")

    # Every combination of the observed values, so combinations that never
    # occur still get a row (with a count of zero).
    full_index = pd.MultiIndex.from_product(
        [topic_data_long[col].unique().tolist() for col in group_cols],
        names=group_cols,
    )

    def count_half(mask):
        return (
            topic_data_long[mask]
            .groupby(group_cols)
            .count()
            .reindex(full_index, fill_value=0)
            .reset_index()
        )

    top_scores = count_half(topic_data_long["rating"] >= mid_point).sort_index(
        ascending=False
    )
    bottom_scores = count_half(topic_data_long["rating"] <= mid_point)

    # The mid point is in both the top and bottom halves, so divide by two
    return (
        _halve_mid_point(top_scores, mid_point),
        _halve_mid_point(bottom_scores, mid_point),
    )


def make_long(data, facets, multi_year=False):
    """Converts a wide dataframe with columns for each topic's rating
    into a long dataframe

    Args:
        data (pandas.DataFrame): A wide dataframe
        facets (list or str): Column(s) to keep as their own column. The
            argument is not modified
        multi_year (bool, optional): Defaults to False. If True, add the
            "year" column to the list of facets

    Returns:
        (pandas.DataFrame): Long dataframe with the columns ``level_1``
            (original row index), the facet columns, ``level_0`` (ordered
            categorical holding the topic column name) and ``rating``
    """
    facets = [facets] if isinstance(facets, str) else list(facets)
    if multi_year:
        facets.append("year")

    long_data = data.set_index(facets, append=True).stack().reset_index()

    # Rename so level_0 always has the values of the topic we are interested
    # in. The original row index comes out as "level_0" and the topic names
    # as "level_2", "level_3" or "level_4", depending on the number of facets.
    long_data = long_data.rename(
        columns={
            "level_0": "level_1",
            "level_4": "level_0",
            "level_3": "level_0",
            "level_2": "level_0",
            0: "rating",
        }
    )
    return long_data.assign(
        level_0=pd.Categorical(long_data.level_0, ordered=True)
    )


def get_data_subset(
    survey_data,
    topic,
    facets=None,
    exclude_new_contributors=False,
    include_year=False,
):
    """Get only the relevant columns from the data

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        topic (str): String that all questions of interest start with
        facets (list, optional): List of columns used for grouping. A "."
            entry is ignored. The list is not modified
        exclude_new_contributors (bool, optional): Defaults to False. If
            True, drop all rows whose "Contributing_Length" value is
            "less than one year"
        include_year (bool, optional): Defaults to False. If True, include
            the "year" column in the output

    Returns:
        (pandas.DataFrame): Survey dataframe with only columns relevant to
            the topic and facets remaining
    """
    topic_cols = [col for col in survey_data.columns if col.startswith(topic)]
    extra_cols = _drop_dot(facets)
    if include_year:
        extra_cols.append("year")
    cols = topic_cols + extra_cols

    if exclude_new_contributors:
        experienced = survey_data["Contributing_Length"] != "less than one year"
        return survey_data[experienced][cols]
    return survey_data[cols]


def get_multi_year_data_subset(
    survey_data, topic, facet_by=None, exclude_new_contributors=False
):
    """Get appropriate data for multi-year plots and convert it to long form

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        topic (str): String that all questions of interest start with
        facet_by (list, optional): List of columns used for grouping. A "."
            entry is ignored. The list is not modified
        exclude_new_contributors (bool, optional): Defaults to False. If
            True, drop all rows whose "Contributing_Length" value is
            "less than one year"

    Returns:
        (pandas.DataFrame): Long dataframe
    """
    topic_data = get_data_subset(
        survey_data, topic, facet_by, exclude_new_contributors, include_year=True
    )
    return make_long(topic_data, _drop_dot(facet_by), multi_year=True)


def get_single_year_data_subset(survey_data, topic, facet_by=None):
    """Get appropriate data for single-year plots and convert it to long form

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        topic (str): String that all questions of interest start with
        facet_by (list, optional): List of columns used for grouping. A "."
            entry is ignored. The list is not modified

    Returns:
        (pandas.DataFrame): Long dataframe
    """
    topic_data = get_data_subset(survey_data, topic, facet_by)
    facets = _drop_dot(facet_by)
    if facets:
        return make_long(topic_data, facets)

    # Without facets there is nothing to keep in the index, so the frame is
    # simply unstacked into (topic column, row index, rating) rows.
    topic_data_long = (
        topic_data.unstack().reset_index().rename(columns={0: "rating"})
    )
    return topic_data_long.assign(
        level_0=pd.Categorical(topic_data_long.level_0, ordered=True)
    )


def make_bar_chart_multi_year(
    survey_data, topic, facet_by=None, exclude_new_contributors=False
):
    """Make a barchart showing proportions of respondents listing each
    column that starts with topic. Bars are colored by which year of
    the survey they correspond to. If facet_by is not empty, the resulting
    plot will be faceted into subplots by the variables given.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        topic (str): String that all questions of interest start with
        facet_by (list, optional): List of columns used for grouping, passed
            to ``facet_grid`` in the order given. The list is not modified
        exclude_new_contributors (bool, optional): Defaults to False. If
            True, do not include any responses from contributors with less
            than one year of experience

    Returns:
        (plotnine.ggplot): Plot object which can be displayed in a notebook
            or saved out to a file
    """
    group_cols = _drop_dot(facet_by) + ["year"]
    topic_data = get_data_subset(
        survey_data, topic, facet_by, exclude_new_contributors, include_year=True
    )

    grouped = topic_data.groupby(group_cols)
    agg = grouped.sum().reset_index().melt(id_vars=group_cols)
    totals = grouped.count().reset_index().melt(id_vars=group_cols)
    percent = agg.merge(totals, on=group_cols + ["variable"])

    # This plot is always done proportionally: value_x is the sum of the
    # column, value_y the number of non-missing answers.
    percent = percent.assign(value=percent["value_x"] / percent["value_y"])
    percent = percent.assign(
        variable=pd.Categorical(percent.variable, ordered=True)
    )
    x_limits = sorted(percent["variable"].unique().tolist())

    br = (
        p9.ggplot(percent, p9.aes(x="variable", fill="factor(year)", y="value"))
        + p9.geom_bar(show_legend=True, position="dodge", stat="identity")
        + p9.theme(
            axis_text_x=p9.element_text(angle=45, ha="right"),
            strip_text_y=p9.element_text(angle=0, ha="left"),
        )
        + p9.scale_x_discrete(
            limits=x_limits,
            labels=[
                shorten(
                    name.replace(topic, "").replace("_", " "),
                    placeholder="...",
                    width=30,
                )
                for name in x_limits
            ],
        )
    )

    if facet_by:
        br = (
            br
            + p9.facet_grid(
                list(facet_by),
                shrink=False,
                labeller=lambda x: "\n".join(wrap(x.replace("/", "/ "), 15)),
            )
            + p9.theme(
                strip_text_x=p9.element_text(
                    wrap=True, va="bottom", margin={"b": -0.5}
                )
            )
        )
    return br


def make_single_bar_chart_multi_year(
    survey_data, column, facet, proportionally=False
):
    """Make a barchart showing the number of respondents responding to
    a single column. Bars are colored by which year of the survey they
    correspond to. The values of the (first) facet column are used as ticks
    on the x-axis.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        column (str): Column to plot responses to
        facet (str or list): Column(s) used for grouping. The first one is
            shown on the x-axis
        proportionally (bool, optional): Defaults to False. If True, the
            bar heights are the share of responses in each group (facet and
            year) whose value is 1, instead of their number

    Returns:
        (plotnine.ggplot): Plot object which can be displayed in a notebook
            or saved out to a file
    """
    facet_cols = [facet] if isinstance(facet, str) else list(facet)
    x_col = facet_cols[0]
    group_cols = facet_cols + ["year"]

    topic_data = survey_data[[column] + facet_cols + ["year"]]
    topic_data_long = make_long(topic_data, facet_cols, multi_year=True)

    # level_1 holds the original row index, so counting it counts responses
    selected = topic_data_long[topic_data_long.rating == 1]
    heights = selected.groupby(group_cols)["level_1"].count()
    if proportionally:
        totals = topic_data_long.groupby(group_cols)["level_1"].count()
        # Groups in which nobody answered 1 have no entry in ``heights``
        heights = (heights / totals).fillna(0)
    proportions = heights.reset_index()

    x_limits = topic_data_long[x_col].unique().tolist()
    return (
        p9.ggplot(proportions, p9.aes(x=x_col, fill="year", y="level_1"))
        + p9.geom_bar(show_legend=False, stat="identity")
        + p9.theme(
            axis_text_x=p9.element_text(angle=45, ha="right"),
            strip_text_y=p9.element_text(angle=0, ha="left"),
        )
        + p9.scale_x_discrete(
            limits=x_limits,
            labels=[value.replace("_", " ") for value in x_limits],
        )
    )


def make_likert_chart_multi_year(
    survey_data,
    topic,
    labels,
    facet_by=None,
    five_is_high=False,
    exclude_new_contributors=False,
):
    """Make an offset stacked barchart showing the number of respondents
    at each rank or value for all columns in the topic. Each column in
    the topic is a facet, with the years displayed along the x-axis.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        topic (str): String that all questions of interest start with
        labels (list): List of strings to use as labels, corresponding
            to the numerical values given by the respondents
        facet_by (list, optional): List of columns used for grouping. A "."
            entry is ignored; the topic columns always form the second facet
            dimension. The list is not modified
        five_is_high (bool, optional): Defaults to False. If True, five is
            considered the highest value in a ranking, otherwise it is taken
            as the lowest value
        exclude_new_contributors (bool, optional): Defaults to False. If
            True, do not include any responses from contributors with less
            than one year of experience

    Returns:
        (plotnine.ggplot): Offset stacked barchart plot object which
            can be displayed in a notebook or saved out to a file
    """
    facets = _drop_dot(facet_by)
    topic_cols = [col for col in survey_data.columns if col.startswith(topic)]
    show_legend = True

    topic_data_long = get_multi_year_data_subset(
        survey_data, topic, facets, exclude_new_contributors
    )

    # When five is the lowest value the ratings are negated so that the
    # "best" rating still sorts highest.
    if not five_is_high:
        topic_data_long = topic_data_long.assign(
            rating=topic_data_long.rating * -1.0
        )
    mid_point = 3 if five_is_high else -3

    top_scores, bottom_scores = split_for_likert(topic_data_long, mid_point)

    # Calculate proportion for each rank
    keys = facets + ["year"]
    top_scores = _scale_to_proportions(
        top_scores, topic_data_long, keys, len(topic_cols)
    )
    bottom_scores = _scale_to_proportions(
        bottom_scores, topic_data_long, keys, len(topic_cols), sign=-1
    )

    vp = (
        p9.ggplot(
            topic_data_long,
            p9.aes(
                x="factor(year)", fill="factor(rating)", color="factor(rating)"
            ),
        )
        + p9.geom_col(
            data=top_scores,
            mapping=p9.aes(y="level_1"),
            show_legend=show_legend,
            size=0.25,
            position=p9.position_stack(reverse=True),
        )
        + p9.geom_col(
            data=bottom_scores,
            mapping=p9.aes(y="level_1"),
            show_legend=show_legend,
            size=0.25,
            position=p9.position_stack(),
        )
        + p9.geom_hline(yintercept=0, color="white")
    )

    if five_is_high:
        vp = (
            vp
            + p9.scale_color_brewer(
                "div", "RdBu", limits=[1, 2, 3, 4, 5], labels=labels
            )
            + p9.scale_fill_brewer(
                "div", "RdBu", limits=[1, 2, 3, 4, 5], labels=labels
            )
            + p9.theme(
                axis_text_x=p9.element_text(angle=45, ha="right"),
                strip_text_y=p9.element_text(angle=0, ha="left"),
            )
        )
    else:
        vp = (
            vp
            + p9.scale_color_brewer(
                "div", "RdBu", limits=[-5, -4, -3, -2, -1], labels=labels
            )
            + p9.scale_fill_brewer(
                "div", "RdBu", limits=[-5, -4, -3, -2, -1], labels=labels
            )
            + p9.theme(strip_text_y=p9.element_text(angle=0, ha="left"))
        )

    # The topic columns (level_0) always take the second facet dimension;
    # "." means no faceting on the first one.
    grid_spec = (facets if facets else ["."]) + ["level_0"]
    vp = (
        vp
        + p9.facet_grid(
            grid_spec,
            labeller=lambda x: "\n".join(
                wrap(
                    x.replace(topic, "")
                    .replace("_", " ")
                    .replace("/", "/ ")
                    .strip(),
                    15,
                )
            ),
        )
        + p9.theme(
            strip_text_x=p9.element_text(wrap=True, ma="left"),
            panel_spacing_x=0.1,
        )
    )
    return vp


def make_bar_chart(survey_data, topic, facet_by=None, proportional=False):
    """Make a barchart showing the number of respondents listing each
    column that starts with topic for a single year. If facet_by is not
    empty, the resulting plot will be faceted into subplots by the variables
    given.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        topic (str): String that all questions of interest start with
        facet_by (list, optional): List of columns used for grouping, passed
            to ``facet_grid`` in the order given. The list is not modified
        proportional (bool, optional): Defaults to False. If True (and
            facets are given), each bar is divided by the number of selected
            answers in its facet

    Returns:
        (plotnine.ggplot): Plot object which can be displayed in a notebook
            or saved out to a file
    """
    show_legend = bool(facet_by)
    facets = _drop_dot(facet_by)

    topic_data_long = get_single_year_data_subset(survey_data, topic, facets)
    selected = topic_data_long[topic_data_long.rating == 1].dropna()

    aggregate_data = (
        selected.groupby(["level_0"] + facets).count().reset_index()
    )

    if proportional and facets:
        facet_sums = selected.groupby(facets).count().reset_index()
        aggregate_data = aggregate_data.merge(facet_sums, on=facets).rename(
            columns={"level_0_x": "level_0"}
        )
        # rating_x is the count per topic and facet, rating_y per facet
        aggregate_data = aggregate_data.assign(
            rating=aggregate_data.rating_x / aggregate_data.rating_y
        )

    topic_names = topic_data_long["level_0"].unique().tolist()

    br = (
        p9.ggplot(
            aggregate_data, p9.aes(x="level_0", fill="level_0", y="rating")
        )
        + p9.geom_bar(show_legend=show_legend, stat="identity")
        + p9.theme(
            axis_text_x=p9.element_text(angle=45, ha="right"),
            strip_text_y=p9.element_text(angle=0, ha="left"),
        )
        + p9.scale_x_discrete(
            limits=topic_names,
            labels=[
                # Keep at most two lines of 35 characters per tick label
                "\n".join(
                    textwrap.wrap(
                        name.replace(topic, "").replace("_", " "), width=35
                    )[0:2]
                )
                for name in topic_names
            ],
        )
    )

    if facet_by:
        # With facets the tick labels are hidden and the legend is used to
        # identify the bars instead.
        br = (
            br
            + p9.facet_grid(
                list(facet_by),
                shrink=False,
                labeller=lambda x: "\n".join(wrap(x, 15)),
            )
            + p9.theme(
                axis_text_x=p9.element_blank(),
                strip_text_x=p9.element_text(
                    wrap=True, va="bottom", margin={"b": -0.5}
                ),
            )
            + p9.scale_fill_discrete(
                limits=topic_names,
                labels=[
                    "\n".join(
                        wrap(
                            name.replace(topic, "")
                            .replace("_", " ")
                            .replace("/", "/ ")
                            .strip(),
                            30,
                        )
                    )
                    for name in topic_names
                ],
            )
        )
    return br


def make_likert_chart(
    survey_data,
    topic,
    labels,
    facet_by=None,
    max_value=5,
    max_is_high=False,
    wrap_facets=True,
    sort_x=False,
):
    """Make an offset stacked barchart showing the number of respondents
    at each rank or value for all columns in the topic. Each column in
    the original data is a tick on the x-axis

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        topic (str): String that all questions of interest start with
        labels (list): List of strings to use as labels, corresponding
            to the numerical values given by the respondents
        facet_by (list, optional): List of columns used for grouping, passed
            to ``facet_grid`` in the order given. The list is not modified
        max_value (int, optional): Defaults to 5. The maximum value a
            respondent can assign
        max_is_high (bool, optional): Defaults to False. If True, the
            max_value is considered the highest value in a ranking, otherwise
            it is taken as the lowest value
        wrap_facets (bool, optional): Defaults to True. If True, the facet
            labels are wrapped
        sort_x (bool, optional): Defaults to False. If True, the x-axis is
            sorted by the mean value for each column in the original data

    Returns:
        (plotnine.ggplot): Offset stacked barchart plot object which
            can be displayed in a notebook or saved out to a file
    """
    facets = _drop_dot(facet_by)
    mid_point = math.ceil(max_value / 2)
    topic_cols = [col for col in survey_data.columns if col.startswith(topic)]
    show_legend = True

    topic_data_long = get_single_year_data_subset(survey_data, topic, facets)

    # When max_value is the lowest rank the ratings are negated so that the
    # "best" rating still sorts highest.
    if not max_is_high:
        topic_data_long = topic_data_long.assign(
            rating=topic_data_long.rating * -1.0
        )
        mid_point = -1 * mid_point

    top_scores, bottom_scores = split_for_likert(topic_data_long, mid_point)

    if facets:
        # With facets the bars show proportions within each facet
        top_scores = _scale_to_proportions(
            top_scores, topic_data_long, facets, len(topic_cols)
        )
        bottom_scores = _scale_to_proportions(
            bottom_scores, topic_data_long, facets, len(topic_cols), sign=-1
        )
    else:
        # Without facets the raw counts are plotted; the bottom half is
        # negated so it is drawn below zero.
        bottom_scores = bottom_scores.assign(level_1=bottom_scores.level_1 * -1)

    if sort_x:
        # Only the rating is averaged; other columns may not be numeric
        x_sort_order = (
            topic_data_long.groupby("level_0")["rating"]
            .mean()
            .sort_values()
            .index.tolist()
        )
        x_sort_order.reverse()
    else:
        x_sort_order = topic_data_long["level_0"].unique().tolist()

    vp = (
        p9.ggplot(
            topic_data_long,
            p9.aes(x="level_0", fill="factor(rating)", color="factor(rating)"),
        )
        + p9.geom_col(
            data=top_scores,
            mapping=p9.aes(y="level_1"),
            show_legend=show_legend,
            size=0.25,
            position=p9.position_stack(reverse=True),
        )
        + p9.geom_col(
            data=bottom_scores,
            mapping=p9.aes(y="level_1"),
            show_legend=show_legend,
            size=0.25,
            position=p9.position_stack(),
        )
        + p9.geom_hline(yintercept=0, color="white")
        + p9.theme(
            axis_text_x=p9.element_text(angle=45, ha="right"),
            strip_text_y=p9.element_text(angle=0, ha="left"),
        )
        + p9.scale_x_discrete(
            limits=x_sort_order,
            labels=[
                # Keep at most two lines of 35 characters per tick label
                "\n".join(
                    textwrap.wrap(
                        name.replace(topic, "").replace("_", " "), width=35
                    )[0:2]
                )
                for name in x_sort_order
            ],
        )
    )

    if max_is_high:
        limits = list(range(1, max_value + 1))
        vp = (
            vp
            + p9.scale_color_brewer("div", "RdBu", limits=limits, labels=labels)
            + p9.scale_fill_brewer("div", "RdBu", limits=limits, labels=labels)
        )
    else:
        # Negated ratings, listed from -1 down to -max_value
        limits = list(reversed(range(-max_value, 0)))
        vp = (
            vp
            + reverse_scale_fill_brewer(
                "div", "RdBu", limits=limits, labels=labels
            )
            + reverse_scale_color_brewer(
                "div", "RdBu", limits=limits, labels=labels
            )
        )

    if facet_by:
        if wrap_facets:
            vp = (
                vp
                + p9.facet_grid(
                    list(facet_by), labeller=lambda x: "\n".join(wrap(x, 15))
                )
                + p9.theme(
                    strip_text_x=p9.element_text(
                        wrap=True, va="bottom", margin={"b": -0.5}
                    )
                )
            )
        else:
            vp = vp + p9.facet_grid(
                list(facet_by), space="free", labeller=lambda x: x
            )
    return vp


def make_single_likert_chart(
    survey_data, column, facet, labels, five_is_high=False
):
    """Make an offset stacked barchart showing the proportion of respondents
    at each rank or value for a single column in the original data. Each
    facet value is shown as a tick on the x-axis

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        column (str): Column holding the ratings to plot
        facet (str): Column used for grouping
        labels (list): List of strings to use as labels, corresponding
            to the numerical values given by the respondents
        five_is_high (bool, optional): Defaults to False. If True, 5 is
            considered the highest value in a ranking, otherwise it is taken
            as the lowest value

    Returns:
        (plotnine.ggplot): Offset stacked barchart plot object which
            can be displayed in a notebook or saved out to a file
    """
    show_legend = True
    mid_point = 3 if five_is_high else -3

    topic_data = survey_data[[column, facet]]
    topic_data_long = make_long(topic_data, facet)

    # When five is the lowest value the ratings are negated so that the
    # "best" rating still sorts highest.
    if not five_is_high:
        topic_data_long = topic_data_long.assign(
            rating=topic_data_long.rating * -1.0
        )

    # What is left after removing the two bookkeeping columns is the facet
    # column and the rating.
    group_cols = topic_data_long.columns.tolist()
    group_cols.remove("level_1")
    group_cols.remove("level_0")

    facet_totals = topic_data_long.groupby(facet).count().reset_index()

    top_scores = (
        topic_data_long[topic_data_long["rating"] >= mid_point]
        .groupby(group_cols)
        .count()
        .reset_index()
        .sort_index(ascending=False)
    )
    # The mid point is in both the top and bottom halves, so divide by two
    top_scores = _halve_mid_point(top_scores, mid_point)
    # After the merge the "_x" columns come from the scores (rating_x is the
    # rating itself) and the "_y" columns hold the per-facet totals.
    top_scores = top_scores.merge(facet_totals, on=facet)
    top_scores = top_scores.assign(
        level_1=top_scores.level_1_x / top_scores.level_1_y
    )

    bottom_scores = (
        topic_data_long[topic_data_long["rating"] <= mid_point]
        .groupby(group_cols)
        .count()
        .reset_index()
    )
    bottom_scores = _halve_mid_point(bottom_scores, mid_point)
    bottom_scores = bottom_scores.merge(facet_totals, on=facet)
    bottom_scores = bottom_scores.assign(
        level_1=bottom_scores.level_1_x * -1 / bottom_scores.level_1_y
    )

    x_limits = topic_data_long[facet].unique().tolist()
    wrapped_labels = ["\n".join(wrap(label, 15)) for label in labels]

    vp = (
        p9.ggplot(
            topic_data_long,
            p9.aes(x=facet, fill="factor(rating_x)", color="factor(rating_x)"),
        )
        + p9.geom_col(
            data=top_scores,
            mapping=p9.aes(y="level_1"),
            show_legend=show_legend,
            size=0.25,
            position=p9.position_stack(reverse=True),
        )
        + p9.geom_col(
            data=bottom_scores,
            mapping=p9.aes(y="level_1"),
            show_legend=show_legend,
            size=0.25,
        )
        + p9.geom_hline(yintercept=0, color="white")
        + p9.theme(
            axis_text_x=p9.element_text(angle=45, ha="right"),
            strip_text_y=p9.element_text(angle=0, ha="left"),
        )
        + p9.scale_x_discrete(
            limits=x_limits,
            labels=[value.replace("_", " ") for value in x_limits],
        )
    )

    if five_is_high:
        vp = (
            vp
            + p9.scale_color_brewer(
                "div", "RdBu", limits=[1, 2, 3, 4, 5], labels=wrapped_labels
            )
            + p9.scale_fill_brewer(
                "div", "RdBu", limits=[1, 2, 3, 4, 5], labels=wrapped_labels
            )
        )
    else:
        vp = (
            vp
            + reverse_scale_fill_brewer(
                "div",
                "RdBu",
                limits=[-1, -2, -3, -4, -5],
                labels=wrapped_labels,
            )
            + reverse_scale_color_brewer(
                "div",
                "RdBu",
                limits=[-1, -2, -3, -4, -5],
                labels=wrapped_labels,
            )
        )
    return vp


def make_single_bar_chart(
    survey_data, column, facet, proportionally=False, facet2=None
):
    """Make a barchart showing the number of respondents marking a certain
    column in the original dataset as True. The facet variable values are
    used as ticks on the x-axis

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        column (str): Column to plot responses to
        facet (str): Column used for grouping
        proportionally (bool, optional): Defaults to False. If True, the
            bar heights are the share of responses in each group whose value
            is 1, instead of their number
        facet2 (str, optional): If provided, a second variable to facet
            against

    Returns:
        (plotnine.ggplot): Plot object which can be displayed in a notebook
            or saved out to a file
    """
    group_cols = [facet, facet2] if facet2 else [facet]
    show_legend = False

    topic_data = survey_data[[column] + group_cols]
    topic_data_long = make_long(topic_data, group_cols)

    # level_1 holds the original row index, so counting it counts responses
    selected = topic_data_long[topic_data_long.rating == 1]
    heights = selected.groupby(group_cols)["level_1"].count()
    if proportionally:
        totals = topic_data_long.groupby(group_cols)["level_1"].count()
        # Groups in which nobody answered 1 have no entry in ``heights``
        heights = (heights / totals).fillna(0)
    proportions = heights.reset_index()

    x_limits = topic_data_long[facet].unique().tolist()
    br = (
        p9.ggplot(proportions, p9.aes(x=facet, fill=facet, y="level_1"))
        + p9.geom_bar(show_legend=show_legend, stat="identity")
        + p9.theme(
            axis_text_x=p9.element_text(angle=45, ha="right"),
            strip_text_y=p9.element_text(angle=0, ha="left"),
        )
        + p9.scale_x_discrete(
            limits=x_limits,
            labels=[value.replace("_", " ") for value in x_limits],
        )
    )

    if facet2:
        br = br + p9.facet_grid([facet2, "."])
    return br