diff --git a/dashboard_AI.py b/dashboard_AI.py index 3fdf7cc..689a4d1 100644 --- a/dashboard_AI.py +++ b/dashboard_AI.py @@ -176,373 +176,673 @@ def call_claude(messages): if report.get("source"): st.markdown(f'๐Ÿ“Š {report["source"]}', unsafe_allow_html=True) -# โ”€โ”€ MAIN LAYOUT: left 62% content | right 38% chatbot โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ -main_col, chat_col = st.columns([0.62, 0.38]) - -with main_col: - # Metric cards - c1, c2, c3, c4 = st.columns(4) - with c1: - st.markdown(f"""
-
Monthly opportunity
-
${total_monthly:,.0f}
-
recoverable now
-
""", unsafe_allow_html=True) - with c2: - st.markdown(f"""
-
Annual opportunity
-
${total_annual:,.0f}
-
if unaddressed
-
""", unsafe_allow_html=True) - with c3: - st.markdown(f"""
-
Findings
-
{len(findings)}
-
services flagged
-
""", unsafe_allow_html=True) - with c4: - if total_spend > 0: - pct = round((total_monthly / total_spend) * 100, 1) - st.markdown(f"""
-
Total spend
-
${total_spend:,.0f}
-
{pct}% recoverable
-
""", unsafe_allow_html=True) - else: - top_f = findings[0] if findings else {} - st.markdown(f"""
-
Top finding
-
{top_f.get('name','โ€”')[:12]}
-
${top_f.get('monthly_saving',0):,.0f}/mo
-
""", unsafe_allow_html=True) - - st.markdown("
", unsafe_allow_html=True) - - # AI summary - st.markdown(f'
๐Ÿค– AI Summary
{report["executive_summary"]}
', - unsafe_allow_html=True) - - # Charts - chart_l, chart_r = st.columns(2) - with chart_l: - st.markdown("#### Cost by service") - src = raw_services or [] - if src: - svc_df = pd.DataFrame([ - {"Service": s["service"][:22], "April ($)": s["apr_2026"]} - for s in sorted(src, key=lambda x: -x["apr_2026"])[:8] - ]) - fig = px.bar(svc_df, x="April ($)", y="Service", orientation="h", - color="April ($)", color_continuous_scale=["#fde8e8","#e05252"], text="April ($)") - fig.update_traces(texttemplate="$%{text:,.0f}", textposition="outside") - fig.update_layout(showlegend=False, coloraxis_showscale=False, - plot_bgcolor="white", paper_bgcolor="white", - margin=dict(l=0,r=60,t=10,b=0), height=260, - yaxis=dict(showgrid=False), xaxis=dict(showgrid=True,gridcolor="#f0f0f0")) - st.plotly_chart(fig, use_container_width=True) - elif all_f_legacy: - svc_t = {} - for f in all_f_legacy: - svc_t[f.get("service","Other")] = svc_t.get(f.get("service","Other"),0)+f.get("monthly_waste_usd",0) - sdf = pd.DataFrame([{"Service":k,"Waste ($)":round(v,2)} for k,v in sorted(svc_t.items(),key=lambda x:-x[1])]) - fig = px.bar(sdf,x="Waste ($)",y="Service",orientation="h", - color="Waste ($)",color_continuous_scale=["#fde8e8","#e05252"],text="Waste ($)") - fig.update_traces(texttemplate="$%{text:,.0f}",textposition="outside") - fig.update_layout(showlegend=False,coloraxis_showscale=False, - plot_bgcolor="white",paper_bgcolor="white", - margin=dict(l=0,r=60,t=10,b=0),height=260, - yaxis=dict(showgrid=False),xaxis=dict(showgrid=True,gridcolor="#f0f0f0")) - st.plotly_chart(fig, use_container_width=True) - - with chart_r: - st.markdown("#### Opportunity by category") - cat_t = {} - for f in findings: - cat_t[f["category"]] = cat_t.get(f["category"],0) + f["monthly_saving"] - if cat_t: - cdf = pd.DataFrame([{"Category":k,"Opp ($)":round(v,2)} for k,v in sorted(cat_t.items(),key=lambda x:-x[1]) if v>0]) - fig2 = px.pie(cdf,values="Opp ($)",names="Category", - 
color_discrete_sequence=["#e05252","#f59e0b","#3b82f6","#8b5cf6","#10b981"],hole=0.45) - fig2.update_traces(textposition="outside",textinfo="label+percent") - fig2.update_layout(showlegend=False,paper_bgcolor="white", - margin=dict(l=0,r=0,t=10,b=0),height=260) - st.plotly_chart(fig2, use_container_width=True) - - # Quick wins - if quick_wins: - st.markdown("#### โšก Quick wins") - for w in quick_wins[:3]: - st.markdown(f'
โœ… {w}
', unsafe_allow_html=True) - - st.markdown("
", unsafe_allow_html=True) - - # Findings - st.markdown("#### ๐Ÿ” Flagged services") - filtered = [f for f in findings if f["category"] in selected_cats and f["severity"] in selected_sev] - if not filtered: - st.info("No findings match filters.") - else: - show_action = st.toggle("Show AWS remediation actions", value=False) - for f in filtered: - sev = f["severity"].lower() - action_html = f'
$ {f["aws_action"]}
' if show_action and f["aws_action"] else "" - saving = f"${f['monthly_saving']:,.2f}/mo opportunity" if f["monthly_saving"] > 0 else "Investigate" - st.markdown(f""" -
-
FINDING #{f['rank']}
-
{f['name']}
-
{f['plain_english']}
-
Impact: {f['business_impact']}
-
- {f['severity']} - ๐Ÿท {f['category']} - ๐Ÿ’ฐ {saving} -
-
๐Ÿ”ง {f['priority_action']}
- {action_html} -
""", unsafe_allow_html=True) - - # Service insights - sb = report.get("service_breakdown", {}) - if sb: - st.markdown("---") - st.markdown("#### ๐Ÿ“Š Service insights") - si1, si2 = st.columns(2) - with si1: - if sb.get("biggest_concern"): st.error(f"๐Ÿšจ **Biggest concern:** {sb['biggest_concern']}") - if sb.get("most_improved"): st.success(f"โœ… **Most improved:** {sb['most_improved']}") - with si2: - if sb.get("watch_list"): st.warning(f"๐Ÿ‘€ **Watch list:** {', '.join(sb['watch_list'])}") +aws_tab, gcp_tab = st.tabs(["โ˜๏ธ AWS โ€” Sample Account", "๐ŸŒ GCP โ€” Perforce ($359K/mo)"]) - st.markdown("---") - st.markdown("#### ๐Ÿ“‹ Leadership recommendation") - st.info(report.get("closing_recommendation", "")) +with aws_tab: + # โ”€โ”€ MAIN LAYOUT: left 62% content | right 38% chatbot โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + main_col, chat_col = st.columns([0.62, 0.38]) - # โ”€โ”€ Untagged Resources Panel โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ - st.markdown("---") - st.markdown("#### ๐Ÿท๏ธ Untagged Resources") - - @st.cache_data - def load_resource_csv(): - """Load converted_costs.csv if available, else fall back to aws_cost_data.csv.""" - for candidate in ["converted_costs.csv", "aws_cost_data.csv"]: - path = os.environ.get("GHOSTBUSTERS_CSV", candidate) - if os.path.exists(path): - try: - return pd.read_csv(path) - except Exception: - continue - return pd.DataFrame() - - rdf = load_resource_csv() - - if rdf.empty: - st.info("No resource CSV loaded. 
Run the detection pipeline first.") - else: - # Identify untagged: team is 'untagged', missing, or environment is 'unknown' - def is_untagged(row): - team = str(row.get("team", "")).strip().lower() - env = str(row.get("environment", "")).strip().lower() - tags = str(row.get("tags", "")).strip().lower() - return ( - team in ("untagged", "", "nan", "none") or - env in ("unknown", "", "nan", "none") or - tags in ("source:cost-explorer", "", "nan", "none") - ) - - untagged_df = rdf[rdf.apply(is_untagged, axis=1)].copy() - tagged_df = rdf[~rdf.apply(is_untagged, axis=1)].copy() - - total_resources = len(rdf) - untagged_count = len(untagged_df) - untagged_cost = untagged_df["monthly_cost_usd"].sum() if "monthly_cost_usd" in untagged_df.columns else 0 - total_cost = rdf["monthly_cost_usd"].sum() if "monthly_cost_usd" in rdf.columns else 0 - untagged_pct = round(untagged_count / total_resources * 100, 1) if total_resources else 0 - untagged_cost_pct= round(untagged_cost / total_cost * 100, 1) if total_cost else 0 - - # Metric cards row - ut1, ut2, ut3, ut4 = st.columns(4) - with ut1: + with main_col: + # Metric cards + c1, c2, c3, c4 = st.columns(4) + with c1: st.markdown(f"""
-
Untagged resources
-
{untagged_count}
-
{untagged_pct}% of total
+
Monthly opportunity
+
${total_monthly:,.0f}
+
recoverable now
""", unsafe_allow_html=True) - with ut2: + with c2: st.markdown(f"""
-
Untagged monthly spend
-
${untagged_cost:,.0f}
-
{untagged_cost_pct}% of total spend
+
Annual opportunity
+
${total_annual:,.0f}
+
if unaddressed
""", unsafe_allow_html=True) - with ut3: + with c3: st.markdown(f"""
-
Tagged resources
-
{len(tagged_df)}
-
{100-untagged_pct}% coverage
-
""", unsafe_allow_html=True) - with ut4: - st.markdown(f"""
-
Untagged annual cost
-
${untagged_cost*12:,.0f}
-
no ownership visibility
+
Findings
+
{len(findings)}
+
services flagged
""", unsafe_allow_html=True) + with c4: + if total_spend > 0: + pct = round((total_monthly / total_spend) * 100, 1) + st.markdown(f"""
+
Total spend
+
${total_spend:,.0f}
+
{pct}% recoverable
+
""", unsafe_allow_html=True) + else: + top_f = findings[0] if findings else {} + st.markdown(f"""
+
Top finding
+
{top_f.get('name','โ€”')[:12]}
+
${top_f.get('monthly_saving',0):,.0f}/mo
+
""", unsafe_allow_html=True) st.markdown("
", unsafe_allow_html=True) - if untagged_count == 0: - st.success("โœ… All resources are tagged. Great governance!") - else: - st.markdown( - f'
โš ๏ธ {untagged_count} resources ({untagged_pct}%) have no team/environment tags โ€” ' - f'accounting for ${untagged_cost:,.2f}/mo of spend with no ownership visibility. ' - f'Without tags you cannot chargeback costs, enforce policies, or identify owners when issues arise.
', - unsafe_allow_html=True - ) - - # Donut: tagged vs untagged by cost - ut_chart_l, ut_chart_r = st.columns(2) - with ut_chart_l: - st.markdown("**Tagging coverage by spend**") - tag_pie = pd.DataFrame([ - {"Status": "Untagged", "Cost": round(untagged_cost, 2)}, - {"Status": "Tagged", "Cost": round(total_cost - untagged_cost, 2)}, + # AI summary + st.markdown(f'
๐Ÿค– AI Summary
{report["executive_summary"]}
', + unsafe_allow_html=True) + + # Charts + chart_l, chart_r = st.columns(2) + with chart_l: + st.markdown("#### Cost by service") + src = raw_services or [] + if src: + svc_df = pd.DataFrame([ + {"Service": s["service"][:22], "April ($)": s["apr_2026"]} + for s in sorted(src, key=lambda x: -x["apr_2026"])[:8] ]) - fig_tag = px.pie(tag_pie, values="Cost", names="Status", - color_discrete_map={"Untagged": "#f59e0b", "Tagged": "#10b981"}, hole=0.5) - fig_tag.update_traces(textposition="outside", textinfo="label+percent") - fig_tag.update_layout(showlegend=False, paper_bgcolor="white", - margin=dict(l=0,r=0,t=10,b=0), height=220) - st.plotly_chart(fig_tag, use_container_width=True) - - with ut_chart_r: - st.markdown("**Untagged spend by service**") - if "service" in untagged_df.columns: - svc_untagged = ( - untagged_df.groupby("service")["monthly_cost_usd"] - .sum().reset_index() - .sort_values("monthly_cost_usd", ascending=True) - .tail(8) - ) - svc_untagged.columns = ["Service", "Cost"] - fig_svc = px.bar(svc_untagged, x="Cost", y="Service", orientation="h", - color="Cost", color_continuous_scale=["#fef3c7", "#f59e0b"], text="Cost") - fig_svc.update_traces(texttemplate="$%{text:,.0f}", textposition="outside") - fig_svc.update_layout(showlegend=False, coloraxis_showscale=False, - plot_bgcolor="white", paper_bgcolor="white", - margin=dict(l=0,r=60,t=10,b=0), height=220, - yaxis=dict(showgrid=False), xaxis=dict(showgrid=True, gridcolor="#f0f0f0")) - st.plotly_chart(fig_svc, use_container_width=True) + fig = px.bar(svc_df, x="April ($)", y="Service", orientation="h", + color="April ($)", color_continuous_scale=["#fde8e8","#e05252"], text="April ($)") + fig.update_traces(texttemplate="$%{text:,.0f}", textposition="outside") + fig.update_layout(showlegend=False, coloraxis_showscale=False, + plot_bgcolor="white", paper_bgcolor="white", + margin=dict(l=0,r=60,t=10,b=0), height=260, + yaxis=dict(showgrid=False), xaxis=dict(showgrid=True,gridcolor="#f0f0f0")) + 
st.plotly_chart(fig, use_container_width=True) + elif all_f_legacy: + svc_t = {} + for f in all_f_legacy: + svc_t[f.get("service","Other")] = svc_t.get(f.get("service","Other"),0)+f.get("monthly_waste_usd",0) + sdf = pd.DataFrame([{"Service":k,"Waste ($)":round(v,2)} for k,v in sorted(svc_t.items(),key=lambda x:-x[1])]) + fig = px.bar(sdf,x="Waste ($)",y="Service",orientation="h", + color="Waste ($)",color_continuous_scale=["#fde8e8","#e05252"],text="Waste ($)") + fig.update_traces(texttemplate="$%{text:,.0f}",textposition="outside") + fig.update_layout(showlegend=False,coloraxis_showscale=False, + plot_bgcolor="white",paper_bgcolor="white", + margin=dict(l=0,r=60,t=10,b=0),height=260, + yaxis=dict(showgrid=False),xaxis=dict(showgrid=True,gridcolor="#f0f0f0")) + st.plotly_chart(fig, use_container_width=True) + + with chart_r: + st.markdown("#### Opportunity by category") + cat_t = {} + for f in findings: + cat_t[f["category"]] = cat_t.get(f["category"],0) + f["monthly_saving"] + if cat_t: + cdf = pd.DataFrame([{"Category":k,"Opp ($)":round(v,2)} for k,v in sorted(cat_t.items(),key=lambda x:-x[1]) if v>0]) + fig2 = px.pie(cdf,values="Opp ($)",names="Category", + color_discrete_sequence=["#e05252","#f59e0b","#3b82f6","#8b5cf6","#10b981"],hole=0.45) + fig2.update_traces(textposition="outside",textinfo="label+percent") + fig2.update_layout(showlegend=False,paper_bgcolor="white", + margin=dict(l=0,r=0,t=10,b=0),height=260) + st.plotly_chart(fig2, use_container_width=True) + + # Quick wins + if quick_wins: + st.markdown("#### โšก Quick wins") + for w in quick_wins[:3]: + st.markdown(f'
โœ… {w}
', unsafe_allow_html=True) + + st.markdown("
", unsafe_allow_html=True) + + # Findings + st.markdown("#### ๐Ÿ” Flagged services") + filtered = [f for f in findings if f["category"] in selected_cats and f["severity"] in selected_sev] + if not filtered: + st.info("No findings match filters.") + else: + show_action = st.toggle("Show AWS remediation actions", value=False) + for f in filtered: + sev = f["severity"].lower() + action_html = f'
$ {f["aws_action"]}
' if show_action and f["aws_action"] else "" + saving = f"${f['monthly_saving']:,.2f}/mo opportunity" if f["monthly_saving"] > 0 else "Investigate" + st.markdown(f""" +
+
FINDING #{f['rank']}
+
{f['name']}
+
{f['plain_english']}
+
Impact: {f['business_impact']}
+
+ {f['severity']} + ๐Ÿท {f['category']} + ๐Ÿ’ฐ {saving} +
+
๐Ÿ”ง {f['priority_action']}
+ {action_html} +
""", unsafe_allow_html=True) + + # Service insights + sb = report.get("service_breakdown", {}) + if sb: + st.markdown("---") + st.markdown("#### ๐Ÿ“Š Service insights") + si1, si2 = st.columns(2) + with si1: + if sb.get("biggest_concern"): st.error(f"๐Ÿšจ **Biggest concern:** {sb['biggest_concern']}") + if sb.get("most_improved"): st.success(f"โœ… **Most improved:** {sb['most_improved']}") + with si2: + if sb.get("watch_list"): st.warning(f"๐Ÿ‘€ **Watch list:** {', '.join(sb['watch_list'])}") + + st.markdown("---") + st.markdown("#### ๐Ÿ“‹ Leadership recommendation") + st.info(report.get("closing_recommendation", "")) + + # โ”€โ”€ Untagged Resources Panel โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + st.markdown("---") + st.markdown("#### ๐Ÿท๏ธ Untagged Resources") + + @st.cache_data + def load_resource_csv(): + """Load converted_costs.csv if available, else fall back to aws_cost_data.csv.""" + for candidate in ["converted_costs.csv", "aws_cost_data.csv"]: + path = os.environ.get("GHOSTBUSTERS_CSV", candidate) + if os.path.exists(path): + try: + return pd.read_csv(path) + except Exception: + continue + return pd.DataFrame() + + rdf = load_resource_csv() + + if rdf.empty: + st.info("No resource CSV loaded. 
Run the detection pipeline first.") + else: + # Identify untagged: team is 'untagged', missing, or environment is 'unknown' + def is_untagged(row): + team = str(row.get("team", "")).strip().lower() + env = str(row.get("environment", "")).strip().lower() + tags = str(row.get("tags", "")).strip().lower() + return ( + team in ("untagged", "", "nan", "none") or + env in ("unknown", "", "nan", "none") or + tags in ("source:cost-explorer", "", "nan", "none") + ) + + untagged_df = rdf[rdf.apply(is_untagged, axis=1)].copy() + tagged_df = rdf[~rdf.apply(is_untagged, axis=1)].copy() + + total_resources = len(rdf) + untagged_count = len(untagged_df) + untagged_cost = untagged_df["monthly_cost_usd"].sum() if "monthly_cost_usd" in untagged_df.columns else 0 + total_cost = rdf["monthly_cost_usd"].sum() if "monthly_cost_usd" in rdf.columns else 0 + untagged_pct = round(untagged_count / total_resources * 100, 1) if total_resources else 0 + untagged_cost_pct= round(untagged_cost / total_cost * 100, 1) if total_cost else 0 + + # Metric cards row + ut1, ut2, ut3, ut4 = st.columns(4) + with ut1: + st.markdown(f"""
+
Untagged resources
+
{untagged_count}
+
{untagged_pct}% of total
+
""", unsafe_allow_html=True) + with ut2: + st.markdown(f"""
+
Untagged monthly spend
+
${untagged_cost:,.0f}
+
{untagged_cost_pct}% of total spend
+
""", unsafe_allow_html=True) + with ut3: + st.markdown(f"""
+
Tagged resources
+
{len(tagged_df)}
+
{100-untagged_pct}% coverage
+
""", unsafe_allow_html=True) + with ut4: + st.markdown(f"""
+
Untagged annual cost
+
${untagged_cost*12:,.0f}
+
no ownership visibility
+
""", unsafe_allow_html=True) - # Table of untagged resources - st.markdown("
", unsafe_allow_html=True) - st.markdown("**Resources missing tags** โ€” sorted by monthly cost") - - show_cols = [c for c in ["resource_id","resource_name","service","region","team","environment","monthly_cost_usd","tags"] if c in untagged_df.columns] - display_df = ( - untagged_df[show_cols] - .sort_values("monthly_cost_usd", ascending=False) - .reset_index(drop=True) - ) - display_df.index += 1 - - # Search filter - tag_search = st.text_input("๐Ÿ” Filter by resource ID or service", placeholder="e.g. vol- or EC2", key="tag_search") - if tag_search: - mask = display_df.apply(lambda row: tag_search.lower() in str(row).lower(), axis=1) - display_df = display_df[mask] - - st.dataframe( - display_df, - use_container_width=True, - height=min(400, 40 + len(display_df) * 35), - column_config={ - "monthly_cost_usd": st.column_config.NumberColumn("Monthly Cost ($)", format="$%.2f"), - "resource_id": st.column_config.TextColumn("Resource ID"), - "resource_name": st.column_config.TextColumn("Name"), - "service": st.column_config.TextColumn("Service"), - "region": st.column_config.TextColumn("Region"), - "team": st.column_config.TextColumn("Team"), - "environment": st.column_config.TextColumn("Environment"), - "tags": st.column_config.TextColumn("Tags"), - } - ) - - # Tagging CLI helper st.markdown("
", unsafe_allow_html=True) - st.markdown("**Fix it โ€” bulk tag via AWS CLI:**") - top_untagged = untagged_df.sort_values("monthly_cost_usd", ascending=False).head(3) - for _, row in top_untagged.iterrows(): - rid = row.get("resource_id", "") - region = row.get("region", "us-east-1") - svc = str(row.get("service", "")).lower() - if "ec2" in svc or rid.startswith(("i-", "vol-", "snap-")): - cli = f"aws ec2 create-tags --resources {rid} --tags Key=team,Value=your-team Key=environment,Value=prod Key=owner,Value=your-name --region {region}" - elif "rds" in svc: - cli = f"aws rds add-tags-to-resource --resource-name {rid} --tags Key=team,Value=your-team Key=environment,Value=prod --region {region}" - elif "s3" in svc: - cli = f"aws s3api put-bucket-tagging --bucket {rid} --tagging 'TagSet=[{{Key=team,Value=your-team}},{{Key=environment,Value=prod}}]'" - else: - cli = f"aws resourcegroupstaggingapi tag-resources --resource-arn-list {rid} --tags team=your-team,environment=prod,owner=your-name --region {region}" + + if untagged_count == 0: + st.success("โœ… All resources are tagged. Great governance!") + else: st.markdown( - f'
$ {cli}
', + f'
โš ๏ธ {untagged_count} resources ({untagged_pct}%) have no team/environment tags โ€” ' + f'accounting for ${untagged_cost:,.2f}/mo of spend with no ownership visibility. ' + f'Without tags you cannot chargeback costs, enforce policies, or identify owners when issues arise.
', unsafe_allow_html=True ) - st.markdown("---") - st.caption("Built for Perforce Global Jam 2026 ยท Team Ghost Busters ยท Cloud Cost Waste Hunter") - -# โ”€โ”€ RIGHT PANEL: FinOps AI Chatbot โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ -with chat_col: - st.markdown(""" -
-
๐Ÿค– FinOps AI
-
- Ask anything about your AWS costs + # Donut: tagged vs untagged by cost + ut_chart_l, ut_chart_r = st.columns(2) + with ut_chart_l: + st.markdown("**Tagging coverage by spend**") + tag_pie = pd.DataFrame([ + {"Status": "Untagged", "Cost": round(untagged_cost, 2)}, + {"Status": "Tagged", "Cost": round(total_cost - untagged_cost, 2)}, + ]) + fig_tag = px.pie(tag_pie, values="Cost", names="Status", + color_discrete_map={"Untagged": "#f59e0b", "Tagged": "#10b981"}, hole=0.5) + fig_tag.update_traces(textposition="outside", textinfo="label+percent") + fig_tag.update_layout(showlegend=False, paper_bgcolor="white", + margin=dict(l=0,r=0,t=10,b=0), height=220) + st.plotly_chart(fig_tag, use_container_width=True) + + with ut_chart_r: + st.markdown("**Untagged spend by service**") + if "service" in untagged_df.columns: + svc_untagged = ( + untagged_df.groupby("service")["monthly_cost_usd"] + .sum().reset_index() + .sort_values("monthly_cost_usd", ascending=True) + .tail(8) + ) + svc_untagged.columns = ["Service", "Cost"] + fig_svc = px.bar(svc_untagged, x="Cost", y="Service", orientation="h", + color="Cost", color_continuous_scale=["#fef3c7", "#f59e0b"], text="Cost") + fig_svc.update_traces(texttemplate="$%{text:,.0f}", textposition="outside") + fig_svc.update_layout(showlegend=False, coloraxis_showscale=False, + plot_bgcolor="white", paper_bgcolor="white", + margin=dict(l=0,r=60,t=10,b=0), height=220, + yaxis=dict(showgrid=False), xaxis=dict(showgrid=True, gridcolor="#f0f0f0")) + st.plotly_chart(fig_svc, use_container_width=True) + + # Table of untagged resources + st.markdown("
", unsafe_allow_html=True) + st.markdown("**Resources missing tags** โ€” sorted by monthly cost") + + show_cols = [c for c in ["resource_id","resource_name","service","region","team","environment","monthly_cost_usd","tags"] if c in untagged_df.columns] + display_df = ( + untagged_df[show_cols] + .sort_values("monthly_cost_usd", ascending=False) + .reset_index(drop=True) + ) + display_df.index += 1 + + # Search filter + tag_search = st.text_input("๐Ÿ” Filter by resource ID or service", placeholder="e.g. vol- or EC2", key="tag_search") + if tag_search: + mask = display_df.apply(lambda row: tag_search.lower() in str(row).lower(), axis=1) + display_df = display_df[mask] + + st.dataframe( + display_df, + use_container_width=True, + height=min(400, 40 + len(display_df) * 35), + column_config={ + "monthly_cost_usd": st.column_config.NumberColumn("Monthly Cost ($)", format="$%.2f"), + "resource_id": st.column_config.TextColumn("Resource ID"), + "resource_name": st.column_config.TextColumn("Name"), + "service": st.column_config.TextColumn("Service"), + "region": st.column_config.TextColumn("Region"), + "team": st.column_config.TextColumn("Team"), + "environment": st.column_config.TextColumn("Environment"), + "tags": st.column_config.TextColumn("Tags"), + } + ) + + # Tagging CLI helper + st.markdown("
", unsafe_allow_html=True) + st.markdown("**Fix it โ€” bulk tag via AWS CLI:**") + top_untagged = untagged_df.sort_values("monthly_cost_usd", ascending=False).head(3) + for _, row in top_untagged.iterrows(): + rid = row.get("resource_id", "") + region = row.get("region", "us-east-1") + svc = str(row.get("service", "")).lower() + if "ec2" in svc or rid.startswith(("i-", "vol-", "snap-")): + cli = f"aws ec2 create-tags --resources {rid} --tags Key=team,Value=your-team Key=environment,Value=prod Key=owner,Value=your-name --region {region}" + elif "rds" in svc: + cli = f"aws rds add-tags-to-resource --resource-name {rid} --tags Key=team,Value=your-team Key=environment,Value=prod --region {region}" + elif "s3" in svc: + cli = f"aws s3api put-bucket-tagging --bucket {rid} --tagging 'TagSet=[{{Key=team,Value=your-team}},{{Key=environment,Value=prod}}]'" + else: + cli = f"aws resourcegroupstaggingapi tag-resources --resource-arn-list {rid} --tags team=your-team,environment=prod,owner=your-name --region {region}" + st.markdown( + f'
$ {cli}
', + unsafe_allow_html=True + ) + + st.markdown("---") + st.caption("Built for Perforce Global Jam 2026 ยท Team Ghost Busters ยท Cloud Cost Waste Hunter") + + # โ”€โ”€ RIGHT PANEL: FinOps AI Chatbot โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + with chat_col: + st.markdown(""" +
+
๐Ÿค– FinOps AI
+
+ Ask anything about your AWS costs +
-
- """, unsafe_allow_html=True) - - # Suggested questions - suggestions = [ - "Which service should I fix first?", - "Why did EC2-Other spike?", - "How much can we save on Neptune?", - "What is the DevOpsAgent charge?", - "Give me a 3-step action plan", - ] - st.markdown("

๐Ÿ’ก Suggested questions:

", - unsafe_allow_html=True) - for i, sug in enumerate(suggestions): - if st.button(sug, key=f"sug_{i}", use_container_width=True): - st.session_state.chat_history.append({"role":"user","content":sug}) + """, unsafe_allow_html=True) + + # Suggested questions + suggestions = [ + "Which service should I fix first?", + "Why did EC2-Other spike?", + "How much can we save on Neptune?", + "What is the DevOpsAgent charge?", + "Give me a 3-step action plan", + ] + st.markdown("

๐Ÿ’ก Suggested questions:

", + unsafe_allow_html=True) + for i, sug in enumerate(suggestions): + if st.button(sug, key=f"sug_{i}", use_container_width=True): + st.session_state.chat_history.append({"role":"user","content":sug}) + with st.spinner("Thinking..."): + ans = call_claude(st.session_state.chat_history) + st.session_state.chat_history.append({"role":"assistant","content":ans}) + + st.markdown("
", unsafe_allow_html=True) + + # Chat history + for msg in st.session_state.chat_history: + if msg["role"] == "user": + st.markdown( + f"
You: {msg['content']}
", + unsafe_allow_html=True) + else: + st.markdown( + f"
๐Ÿค– FinOps AI: {msg['content']}
", + unsafe_allow_html=True) + + st.markdown("
", unsafe_allow_html=True) + + # Input + if prompt_input := st.chat_input("Ask about your AWS costs..."): + st.session_state.chat_history.append({"role":"user","content":prompt_input}) with st.spinner("Thinking..."): ans = call_claude(st.session_state.chat_history) st.session_state.chat_history.append({"role":"assistant","content":ans}) + st.rerun() - st.markdown("
", unsafe_allow_html=True) + if st.session_state.chat_history: + if st.button("๐Ÿ—‘๏ธ Clear chat", use_container_width=True): + st.session_state.chat_history = [] + st.rerun() - # Chat history - for msg in st.session_state.chat_history: - if msg["role"] == "user": - st.markdown( - f"
You: {msg['content']}
", - unsafe_allow_html=True) +with gcp_tab: + # โ”€โ”€ GCP LAYOUT โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + gcp_main, gcp_chat = st.columns([0.62, 0.38]) + + with gcp_main: + _gcp_report_path = os.environ.get("GHOSTBUSTERS_GCP_REPORT", "gcp_report.json") + + @st.cache_data + def load_gcp_report(path=_gcp_report_path): + if not os.path.exists(path): + return None + with open(path) as _f: + return json.load(_f) + + gr = load_gcp_report() + + if gr is None: + st.info("No GCP report found. Run the pipeline:") + st.code("python3 gcp_detection_engine.py\nexport ANTHROPIC_API_KEY=$(cat SM_api_key) && python3 gcp_analyzer.py", language="bash") else: + g_spend = gr.get("total_monthly_spend", 0) + g_opp = gr.get("total_monthly_opportunity", 0) + g_annual = gr.get("total_annual_opportunity", g_opp * 12) + g_saved = gr.get("savings_already_applied", gr.get("savings_efficiency_pct", 0)) + g_findings= gr.get("findings", []) + g_list = gr.get("total_list_cost", 0) + g_eff = gr.get("savings_efficiency_pct", 0) + + # Source badge + if gr.get("source"): + st.markdown(f'''๐Ÿ“Š {gr["source"]}''', unsafe_allow_html=True) + + # Metric cards + gc1, gc2, gc3, gc4 = st.columns(4) + with gc1: + st.markdown(f'''
+
GCP Monthly Spend
+
${g_spend:,.0f}
+
after SADA discounts
+
''', unsafe_allow_html=True) + with gc2: + st.markdown(f'''
+
SADA Savings Applied
+
{g_eff:.0f}%
+
${g_list - g_spend:,.0f} saved/mo
+
''', unsafe_allow_html=True) + with gc3: + st.markdown(f'''
+
Additional Opportunity
+
${g_opp:,.0f}
+
additional/mo
+
''', unsafe_allow_html=True) + with gc4: + st.markdown(f'''
+
Annual Opportunity
+
${g_annual:,.0f}
+
if unaddressed
+
''', unsafe_allow_html=True) + + st.markdown("
", unsafe_allow_html=True) + + # AI Summary st.markdown( - f"
๐Ÿค– FinOps AI: {msg['content']}
", + f'''
๐Ÿค– AI Summary
{gr.get("executive_summary","")}
''', unsafe_allow_html=True) - st.markdown("
", unsafe_allow_html=True) + # Charts row + gc_l, gc_r = st.columns(2) + with gc_l: + st.markdown("#### GCP spend by service") + svcs = gr.get("raw_services", []) + if svcs: + svc_df = pd.DataFrame([ + {"Service": s["service"][:28], "Monthly ($)": round(s["subtotal"], 2)} + for s in sorted(svcs, key=lambda x: -x["subtotal"])[:10] + ]) + fig_g = px.bar(svc_df, x="Monthly ($)", y="Service", orientation="h", + color="Monthly ($)", color_continuous_scale=["#e0f2fe","#0ea5e9"], text="Monthly ($)") + fig_g.update_traces(texttemplate="$%{text:,.0f}", textposition="outside") + fig_g.update_layout(showlegend=False, coloraxis_showscale=False, + plot_bgcolor="white", paper_bgcolor="white", + margin=dict(l=0,r=70,t=10,b=0), height=300, + yaxis=dict(showgrid=False), xaxis=dict(showgrid=True, gridcolor="#f0f0f0")) + st.plotly_chart(fig_g, use_container_width=True) + + with gc_r: + st.markdown("#### List cost vs actual spend") + svcs = gr.get("raw_services", []) + if svcs: + top5 = sorted(svcs, key=lambda x: -x["subtotal"])[:6] + savings_df = pd.DataFrame([ + {"Service": s["service"][:20], "Type": "Actual (after savings)", "Cost": round(s["subtotal"], 2)} + for s in top5 + ] + [ + {"Service": s["service"][:20], "Type": "SADA Savings", "Cost": round(s.get("total_savings", 0), 2)} + for s in top5 + ]) + fig_s = px.bar(savings_df, x="Cost", y="Service", color="Type", orientation="h", + color_discrete_map={"Actual (after savings)": "#0ea5e9", "SADA Savings": "#10b981"}, + barmode="stack") + fig_s.update_layout(showlegend=True, plot_bgcolor="white", paper_bgcolor="white", + margin=dict(l=0,r=70,t=10,b=0), height=300, + yaxis=dict(showgrid=False), xaxis=dict(showgrid=True, gridcolor="#f0f0f0"), + legend=dict(orientation="h", yanchor="bottom", y=1.02)) + st.plotly_chart(fig_s, use_container_width=True) + + # MoM trend chart + svcs = gr.get("raw_services", []) + trend_svcs = [s for s in svcs if s.get("pct_change") is not None and abs(s["pct_change"]) > 5 and s["subtotal"] > 100] + if 
trend_svcs: + st.markdown("#### Month-over-month spend changes") + trend_df = pd.DataFrame([ + {"Service": s["service"][:28], "Change (%)": s["pct_change"]} + for s in sorted(trend_svcs, key=lambda x: x["pct_change"]) + ]) + colors = ["#ef4444" if v > 0 else "#10b981" for v in trend_df["Change (%)"]] + fig_t = px.bar(trend_df, x="Change (%)", y="Service", orientation="h", + color="Change (%)", + color_continuous_scale=[[0,"#10b981"],[0.5,"#f59e0b"],[1,"#ef4444"]], + text="Change (%)") + fig_t.update_traces(texttemplate="%{text:+.0f}%", textposition="outside") + fig_t.add_vline(x=0, line_width=1, line_color="#94a3b8") + fig_t.update_layout(showlegend=False, coloraxis_showscale=False, + plot_bgcolor="white", paper_bgcolor="white", + margin=dict(l=0,r=70,t=10,b=0), height=max(220, len(trend_df)*32), + yaxis=dict(showgrid=False), xaxis=dict(showgrid=True, gridcolor="#f0f0f0")) + st.plotly_chart(fig_t, use_container_width=True) + + # AI Spend insight + ai_insight = gr.get("ai_spend_insight", "") + if ai_insight: + st.markdown("---") + st.markdown("#### ๐Ÿค– AI / LLM Spend") + st.markdown(f'''
{ai_insight}
''', unsafe_allow_html=True) + # AI services breakdown + ai_svcs = [s for s in gr.get("raw_services",[]) if any(kw in s["service"].lower() for kw in ["claude","gemini","vertex","dialogflow"])] + if ai_svcs: + ai_total = sum(s["subtotal"] for s in ai_svcs) + ai_cols = st.columns(len(ai_svcs[:4])) + for col, svc in zip(ai_cols, sorted(ai_svcs, key=lambda x: -x["subtotal"])[:4]): + pct = svc.get("pct_change") + pct_str = f"{pct:+.0f}% MoM" if pct is not None else "New" + with col: + st.markdown(f'''
+
{svc["service"][:22]}
+
${svc["subtotal"]:,.0f}
+
{pct_str}
+
''', unsafe_allow_html=True) + + # Quick wins + qw = gr.get("quick_wins", []) + if qw: + st.markdown("---") + st.markdown("#### โšก Quick wins") + for w in qw[:3]: + st.markdown(f'''
โœ… {w}
''', unsafe_allow_html=True) - # Input - if prompt_input := st.chat_input("Ask about your AWS costs..."): - st.session_state.chat_history.append({"role":"user","content":prompt_input}) - with st.spinner("Thinking..."): - ans = call_claude(st.session_state.chat_history) - st.session_state.chat_history.append({"role":"assistant","content":ans}) - st.rerun() + st.markdown("
", unsafe_allow_html=True) - if st.session_state.chat_history: - if st.button("๐Ÿ—‘๏ธ Clear chat", use_container_width=True): - st.session_state.chat_history = [] - st.rerun() + # Findings + st.markdown("#### ๐Ÿ” GCP findings") + if not g_findings: + st.info("No findings in report.") + else: + show_gcp_actions = st.toggle("Show GCP remediation actions", value=False, key="gcp_actions_toggle") + for f in g_findings: + sev = f.get("severity","MEDIUM").lower() + opp = f.get("monthly_opportunity", 0) + saving_str = f"${opp:,.2f}/mo opportunity" if opp > 0 else "Investigate" + gcp_action = f.get("gcp_action", "") + action_html = f'''
$ {gcp_action}
''' if show_gcp_actions and gcp_action else "" + st.markdown(f""" +
+
FINDING #{f.get("rank","")}
+
{f.get("service","")}
+
{f.get("plain_english","")}
+
Impact: {f.get("business_impact","")}
+
+ {f.get("severity","MEDIUM")} + ๐Ÿท {f.get("category","")} + ๐Ÿ’ฐ {saving_str} +
+
๐Ÿ”ง {f.get("priority_action","")}
+ {action_html} +
""", unsafe_allow_html=True) + + # SADA savings assessment + leadership recommendation + sada_insight = gr.get("sada_savings_assessment","") + if sada_insight: + st.markdown("---") + st.markdown("#### ๐Ÿ’ฐ SADA Discount Assessment") + st.success(sada_insight) + + st.markdown("---") + st.markdown("#### ๐Ÿ“‹ Leadership recommendation") + st.info(gr.get("closing_recommendation","")) + + st.markdown("---") + st.caption("Built for Perforce Global Jam 2026 ยท Team Ghost Busters ยท Cloud Cost Waste Hunter") + + with gcp_chat: + st.markdown(""" +
+
๐Ÿค– GCP FinOps AI
+
+ Ask anything about your GCP costs +
+
+ """, unsafe_allow_html=True) + + if gr is None: + st.info("Run the GCP pipeline to enable this chatbot.") + else: + def build_gcp_context(): + lines = [ + "You are a senior FinOps engineer assistant for GCP cloud costs at Perforce.", + "Answer clearly and concisely, grounding every answer in the actual billing data below.", + "Keep answers to 3-5 sentences unless the user asks for detail.", + "", + f"Data source: {gr.get('source','SADA GCP Billing Report')}", + f"Total monthly GCP spend: ${gr.get('total_monthly_spend',0):,.2f}", + f"Additional monthly opportunity: ${gr.get('total_monthly_opportunity',0):,.2f}", + f"Executive summary: {gr.get('executive_summary','')}", + "", + "FINDINGS:", + ] + for fi in g_findings: + lines.append( + f"#{fi.get('rank','')} {fi.get('service','')} | " + f"${fi.get('monthly_opportunity',0):,.2f}/mo | " + f"{fi.get('plain_english','')[:120]}" + ) + lines += ["", "QUICK WINS:"] + [f"- {w}" for w in gr.get("quick_wins",[])] + lines.append(f"AI insight: {gr.get('ai_spend_insight','')}") + return "\n".join(l for l in lines if l is not None) + + def call_claude_gcp(messages): + api_key = os.environ.get("ANTHROPIC_API_KEY","") + if not api_key: + return "โš ๏ธ ANTHROPIC_API_KEY not set." 
+ try: + payload = json.dumps({ + "model": "claude-sonnet-4-20250514", + "max_tokens": 800, + "system": build_gcp_context(), + "messages": messages + }).encode() + req = urllib.request.Request( + "https://api.anthropic.com/v1/messages", + data=payload, + headers={"Content-Type":"application/json","x-api-key":api_key,"anthropic-version":"2023-06-01"}, + method="POST" + ) + with urllib.request.urlopen(req, timeout=30) as resp: + data = json.loads(resp.read().decode()) + return data["content"][0]["text"] + except Exception as e: + return f"โŒ Error: {e}" + + if "gcp_chat_history" not in st.session_state: + st.session_state.gcp_chat_history = [] + + gcp_suggestions = [ + "What is driving our GCP spend?", + "Why is Claude Sonnet so expensive?", + "How much are we saving with SADA?", + "What can we do about Compute Engine costs?", + "Give me a GCP cost reduction plan", + ] + st.markdown("

๐Ÿ’ก Suggested questions:

", unsafe_allow_html=True) + for i, sug in enumerate(gcp_suggestions): + if st.button(sug, key=f"gcp_sug_{i}", use_container_width=True): + st.session_state.gcp_chat_history.append({"role":"user","content":sug}) + with st.spinner("Thinking..."): + ans = call_claude_gcp(st.session_state.gcp_chat_history) + st.session_state.gcp_chat_history.append({"role":"assistant","content":ans}) + + st.markdown("
", unsafe_allow_html=True) + for msg in st.session_state.gcp_chat_history: + if msg["role"] == "user": + st.markdown(f"
You: {msg['content']}
", unsafe_allow_html=True) + else: + st.markdown(f"
๐Ÿค– GCP AI: {msg['content']}
", unsafe_allow_html=True) + st.markdown("
", unsafe_allow_html=True) + + if prompt_gcp := st.chat_input("Ask about your GCP costs...", key="gcp_chat_input"): + st.session_state.gcp_chat_history.append({"role":"user","content":prompt_gcp}) + with st.spinner("Thinking..."): + ans = call_claude_gcp(st.session_state.gcp_chat_history) + st.session_state.gcp_chat_history.append({"role":"assistant","content":ans}) + st.rerun() + + if st.session_state.gcp_chat_history: + if st.button("๐Ÿ—‘๏ธ Clear GCP chat", use_container_width=True, key="clear_gcp_chat"): + st.session_state.gcp_chat_history = [] + st.rerun() diff --git a/gcp_analyzer.py b/gcp_analyzer.py new file mode 100644 index 0000000..675e1f5 --- /dev/null +++ b/gcp_analyzer.py @@ -0,0 +1,153 @@ +""" +GhostBusters โ€” GCP LLM Analyzer +Sends gcp_findings.json to Claude and produces gcp_report.json +""" + +import json +import re +import os +import urllib.request +from datetime import datetime + + +def load_gcp_findings(filepath: str = "gcp_findings.json") -> dict: + with open(filepath) as f: + return json.load(f) + + +def build_gcp_prompt(data: dict) -> str: + total_spend = data["total_monthly_spend"] + total_list = data["total_list_cost"] + total_saved = data["total_savings_applied"] + efficiency = data["savings_efficiency_pct"] + opportunity = data["total_monthly_opportunity"] + n_findings = data["total_findings"] + source = data["source"] + + top_findings = data["findings"][:12] + findings_txt = json.dumps(top_findings, indent=2) + + # Build AI spend summary + ai_findings = [f for f in data["findings"] if "AI" in f.get("category","") or "claude" in f.get("service","").lower() or "gemini" in f.get("service","").lower()] + ai_txt = json.dumps(ai_findings, indent=2) if ai_findings else "none" + + return f"""You are a senior FinOps engineer and GCP cloud cost analyst at Perforce. 
+You are analyzing a SADA billing report for the period covered by: {source} + +ACCOUNT SUMMARY: +- Cloud provider: GCP (via SADA reseller) +- Total monthly spend (after discounts): ${total_spend:,.2f} +- Total list cost (before any discounts): ${total_list:,.2f} +- Total savings applied by SADA: ${total_saved:,.2f} ({efficiency}% discount rate) +- Additional optimization opportunity identified: ${opportunity:,.2f}/mo +- Total findings: {n_findings} +- Waste by category: {json.dumps(data.get("waste_by_category", {}))} + +TOP FINDINGS (ranked by opportunity): +{findings_txt} + +AI/LLM SPEND: +{ai_txt} + +Respond ONLY with a valid JSON object โ€” no preamble, no markdown fences. Use this exact schema: +{{ + "executive_summary": "3-sentence summary a CTO would read. Include total GCP spend, what SADA already saves, remaining opportunity, and biggest risk.", + "total_monthly_spend": , + "total_monthly_opportunity": , + "total_annual_opportunity": , + "savings_already_applied": , + "findings": [ + {{ + "rank": , + "service": "", + "category": "", + "severity": "HIGH|MEDIUM|LOW", + "monthly_cost": , + "monthly_opportunity": , + "plain_english": "2-sentence explanation of the issue and why it costs money. No jargon.", + "business_impact": "1 sentence on the business risk if left unaddressed.", + "priority_action": "One concrete action the team should take this week.", + "gcp_action": "" + }} + ], + "quick_wins": ["3 GCP-specific actions the team can take today that require zero downtime"], + "ai_spend_insight": "2-sentence insight specifically about AI/LLM spend (Claude, Vertex, Gemini). Include trend and recommendation.", + "sada_savings_assessment": "1-sentence assessment of how well SADA negotiated discounts and whether there is room to push for more.", + "service_breakdown": {{ + "biggest_concern": "", + "most_improved": "", + "watch_list": ["", "", ""] + }}, + "closing_recommendation": "2-sentence closing advice for engineering leadership on GCP cost governance." 
def call_claude(prompt: str) -> str:
    """Send a single-turn prompt to the Anthropic Messages API and return the reply text.

    Args:
        prompt: Full user prompt; it is sent as the only message of the request.

    Returns:
        The ``text`` field of the first content block in the API response.

    Raises:
        ValueError: If the ``ANTHROPIC_API_KEY`` environment variable is unset
            or empty.

    Errors raised by ``urllib`` (network failures, non-2xx responses) are not
    caught here and propagate to the caller.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY", "")
    if not api_key:
        raise ValueError("ANTHROPIC_API_KEY not set. Run: export ANTHROPIC_API_KEY='sk-ant-...'")

    payload = json.dumps({
        "model": "claude-sonnet-4-20250514",
        "max_tokens": 4000,
        "messages": [{"role": "user", "content": prompt}]
    }).encode()

    req = urllib.request.Request(
        "https://api.anthropic.com/v1/messages",
        data=payload,
        headers={
            "Content-Type": "application/json",
            "x-api-key": api_key,
            "anthropic-version": "2023-06-01",
        },
        method="POST"
    )
    with urllib.request.urlopen(req, timeout=60) as resp:
        data = json.loads(resp.read().decode())
    return data["content"][0]["text"]


def extract_json(raw: str) -> dict:
    """Parse the JSON object contained in a model reply.

    The reply is expected to be a bare JSON object, but models sometimes wrap
    it in Markdown code fences or add a sentence before/after it.  Fences at
    the very start/end are stripped first; if the remainder still does not
    parse, the text between the first ``{`` and the last ``}`` is tried.

    Args:
        raw: Text returned by the model.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If no parseable JSON object can be found
            (a subclass of ``ValueError``, as before).
    """
    text = raw.strip()
    # Strip markdown fences if present
    text = re.sub(r"^```(?:json)?\s*", "", text)
    text = re.sub(r"\s*```$", "", text)
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # Fall back to the outermost {...} span so a stray preamble or
        # trailing remark does not discard an otherwise valid report.
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end <= start:
            raise
        return json.loads(text[start:end + 1])
def load_sada_csv(filepath: str) -> pd.DataFrame:
    """Parse a SADA GCP billing CSV into a clean, service-level DataFrame.

    Rows without a service description (and, when that column exists, without
    a Service ID) are treated as Subtotal/Tax/Total summary lines and dropped.
    The SADA column headers are renamed to short snake_case names, the money
    columns are coerced to floats (missing or unparseable cells become 0), and
    three derived columns are added:

    * ``pct_change``    - month-over-month change as a float, or ``None`` when
      the cell is "New", blank, or not a number.
    * ``is_new``        - True when the raw percent-change cell reads "New".
    * ``total_savings`` - sum of the absolute values of the three savings
      columns.

    Args:
        filepath: Path to the SADA billing CSV export.

    Returns:
        The cleaned DataFrame with a fresh 0..n-1 index.
    """
    df = pd.read_csv(filepath)

    # Drop summary rows: blank service description, or blank Service ID when
    # that column is present.  astype(str) keeps the .str accessor usable even
    # if pandas inferred a numeric dtype for the column.
    desc = df["Service description"]
    keep = desc.notna() & (desc.astype(str).str.strip() != "")
    if "Service ID" in df.columns:
        sid = df["Service ID"]
        keep &= sid.notna() & (sid.astype(str).str.strip() != "")
    df = df[keep]

    # Rename for convenience
    df = df.rename(columns={
        "Service description": "service",
        "Service ID": "service_id",
        "List cost ($)": "list_cost",
        "Negotiated savings ($)": "negotiated_savings",
        "Savings programs ($)": "savings_programs",
        "Other savings ($)": "other_savings",
        "Unrounded subtotal ($)": "unrounded_subtotal",
        "Subtotal ($)": "subtotal",
        "Percent change in subtotal compared to previous period": "pct_change_raw",
    })

    # Coerce numeric.  Currency symbols and thousands separators are removed
    # first; otherwise a cell such as "1,234.50" would be coerced to NaN and
    # then silently replaced by 0.
    for col in ("list_cost", "negotiated_savings", "savings_programs", "other_savings", "subtotal"):
        cleaned = df[col].astype(str).str.replace(r"[$,\s]", "", regex=True)
        df[col] = pd.to_numeric(cleaned, errors="coerce").fillna(0)

    def parse_pct(v):
        """Return the numeric part of "29%", "-2%", "0%"; None for "New"/blank."""
        if v is None or (isinstance(v, float) and v != v):  # None or NaN cell
            return None
        text = str(v).strip().replace(",", "")
        if text.lower() == "new":
            return None  # new service, no prior period
        m = re.search(r"-?\d+(?:\.\d+)?", text)
        return float(m.group()) if m else None

    raw_pct = df["pct_change_raw"]
    # Build the column with dtype=object on purpose: in a float column pandas
    # would turn None into NaN, and the `pct is None` checks used by the
    # detectors would then never fire.
    df["pct_change"] = pd.Series(
        [parse_pct(v) for v in raw_pct], index=df.index, dtype=object
    )
    df["is_new"] = raw_pct.astype(str).str.strip().str.lower() == "new"
    df["total_savings"] = (
        df["negotiated_savings"].abs()
        + df["savings_programs"].abs()
        + df["other_savings"].abs()
    )
    return df.reset_index(drop=True)
def detect_sharp_drops(df: pd.DataFrame) -> list[dict]:
    """Flag services whose spend fell by more than DROP_PCT_THRESHOLD percent MoM.

    Such drops may indicate unintentional shutdowns.  Rows with no usable
    percent change (new services, blank cells) and rows whose current subtotal
    is under $10 are ignored.

    Args:
        df: Service-level frame with ``service``, ``service_id``, ``subtotal``
            and ``pct_change`` columns.

    Returns:
        One finding dict per flagged service, steepest drop first.
    """
    findings = []
    for _, row in df.iterrows():
        pct = row["pct_change"]
        # A missing percentage can arrive as None or as NaN once the column has
        # been through pandas.  NaN compares False against everything, so an
        # `is None` test alone would let it fall through and be reported.
        if pd.isna(pct) or pct >= -DROP_PCT_THRESHOLD:
            continue
        if row["subtotal"] < 10:  # too small to care
            continue
        findings.append({
            "detector": "sharp_drop",
            "category": "Anomaly",
            "service": row["service"],
            "service_id": row["service_id"],
            "monthly_cost": round(row["subtotal"], 2),
            "pct_change": pct,
            "severity": "MEDIUM",
            "plain_english": (
                f"{row['service']} dropped {abs(pct):.0f}% MoM to ${row['subtotal']:,.2f}/mo. "
                f"Large drops can indicate accidental shutdowns, missing workloads, or billing credit anomalies."
            ),
            "monthly_opportunity": 0,
            "priority_action": (
                f"Verify that the drop in {row['service']} is intentional. "
                f"Check GCP Console → Billing → Cost table for credits or terminated resources."
            ),
        })
    return sorted(findings, key=lambda x: x["pct_change"])
def detect_cud_opportunity(df: pd.DataFrame) -> list[dict]:
    """Flag CUD-eligible services with low or no Committed Use Discount coverage.

    A service is reported when its name contains one of the CUD_ELIGIBLE
    keywords, its list cost is at least $500/mo, and the savings-programs
    amount covers at most 40% of list cost.  The opportunity is estimated as a
    flat 30% of list cost.

    Args:
        df: Service-level frame with ``service``, ``service_id``,
            ``list_cost``, ``savings_programs``, ``subtotal`` and
            ``pct_change`` columns.

    Returns:
        One finding dict per flagged service, largest opportunity first.
    """
    findings = []
    for _, row in df.iterrows():
        svc_lower = row["service"].lower()
        if not any(kw in svc_lower for kw in CUD_ELIGIBLE):
            continue
        list_cost = row["list_cost"]
        if list_cost < 500:
            continue
        # list_cost is >= 500 here, so the division cannot hit zero.
        cud_coverage = abs(row["savings_programs"]) / list_cost
        if cud_coverage > 0.40:  # already has good CUD coverage
            continue
        potential = round(list_cost * 0.30, 2)  # conservative 30% CUD saving estimate
        pct = row["pct_change"]
        findings.append({
            "detector": "cud_opportunity",
            "category": "Reserved / Committed Use",
            "service": row["service"],
            "service_id": row["service_id"],
            "monthly_cost": round(row["subtotal"], 2),
            # Store a missing percentage as None: NaN would be written to the
            # findings file as the bare token NaN, which is not valid JSON.
            "pct_change": None if pd.isna(pct) else pct,
            "severity": "HIGH" if list_cost > 5000 else "MEDIUM",
            "plain_english": (
                f"{row['service']} has a list cost of ${list_cost:,.2f}/mo with only "
                f"{cud_coverage*100:.0f}% covered by Committed Use Discounts. "
                f"Purchasing 1-year CUDs could save ~30%."
            ),
            "monthly_opportunity": potential,
            "priority_action": (
                f"Purchase 1-year Committed Use Discounts for {row['service']} in "
                f"GCP Console → Billing → Committed use discounts. "
                f"Estimated saving: ${potential:,.2f}/mo."
            ),
        })
    return sorted(findings, key=lambda x: -x["monthly_opportunity"])
def detect_logging_costs(df: pd.DataFrame) -> list[dict]:
    """Flag high Cloud Logging costs, which are often driven by verbose log sinks.

    All services whose name contains "logging" are summed; a single finding is
    produced when that total reaches LOGGING_SPIKE_USD.  The opportunity is
    estimated as 60% of the logging total.

    Args:
        df: Service-level frame with ``service``, ``service_id``, ``subtotal``
            and ``pct_change`` columns.

    Returns:
        A list with zero or one finding dict.
    """
    findings = []
    log_rows = df[df["service"].str.lower().str.contains("logging")]
    if log_rows.empty:
        return findings
    log_cost = log_rows["subtotal"].sum()
    if log_cost < LOGGING_SPIKE_USD:
        return findings
    first = log_rows.iloc[0]  # id and trend are taken from the first matching row
    pct = first["pct_change"]
    findings.append({
        "detector": "logging_costs",
        "category": "Log Retention",
        "service": "Cloud Logging",
        "service_id": first["service_id"],
        "monthly_cost": round(log_cost, 2),
        # None instead of NaN so the finding serialises to valid JSON.
        "pct_change": None if pd.isna(pct) else pct,
        "severity": "MEDIUM",
        "plain_english": (
            f"Cloud Logging costs ${log_cost:,.2f}/mo. "
            f"Verbose application logs, audit logs, and VPC flow logs often account for "
            f"60-70% of this spend — much of it stored indefinitely."
        ),
        "monthly_opportunity": round(log_cost * 0.60, 2),
        "priority_action": (
            "In GCP Console → Logging → Log Router, exclude high-volume low-value logs "
            "(DEBUG, INFO for non-critical services). Apply 30-day retention to all non-audit "
            "log buckets. Archive to Cloud Storage for compliance if needed."
        ),
    })
    return findings


def detect_new_services(df: pd.DataFrame) -> list[dict]:
    """Flag brand-new services that appeared this billing period.

    Args:
        df: Service-level frame with ``service``, ``service_id``, ``subtotal``
            and a boolean ``is_new`` column.

    Returns:
        One LOW-severity finding dict per row whose ``is_new`` flag is True,
        in frame order.
    """
    findings = []
    # .eq(True) keeps the original comparison semantics (anything that is not
    # True, including a missing flag, is excluded) without writing `== True`.
    new_rows = df[df["is_new"].eq(True)]
    for _, row in new_rows.iterrows():
        findings.append({
            "detector": "new_service",
            "category": "New Service",
            "service": row["service"],
            "service_id": row["service_id"],
            "monthly_cost": round(row["subtotal"], 2),
            "pct_change": None,
            "severity": "LOW",
            "plain_english": (
                f"{row['service']} appeared for the first time this billing period "
                f"at ${row['subtotal']:,.2f}. "
                f"New services should be reviewed to ensure they are intentional and owned."
            ),
            "monthly_opportunity": 0,
            "priority_action": (
                f"Confirm {row['service']} was intentionally enabled. "
                f"Assign an owner, add budget alerts, and tag the associated GCP project."
            ),
        })
    return findings
def _nan_to_none(value):
    """Recursively replace non-finite floats (NaN, +/-inf) with None for strict JSON."""
    if isinstance(value, float) and not (float("-inf") < value < float("inf")):
        return None
    if isinstance(value, dict):
        return {k: _nan_to_none(v) for k, v in value.items()}
    if isinstance(value, (list, tuple)):
        return [_nan_to_none(v) for v in value]
    return value


def _dedupe_findings(findings: list[dict], sev_order: dict) -> list[dict]:
    """Keep one finding per (category, service): largest opportunity, then highest severity."""
    best: dict[tuple, tuple] = {}
    for f in findings:
        key = (f["category"], f["service"])
        score = (-f["monthly_opportunity"], sev_order.get(f["severity"], 3))
        if key not in best or score < best[key][0]:
            best[key] = (score, f)
    return [f for _, f in best.values()]


def run_gcp_detection(csv_path: str, output_path: str = "gcp_findings.json"):
    """Run every GCP detector over a SADA billing CSV and write the findings file.

    Loads the CSV, computes account totals, runs all detectors, removes
    overlapping findings, ranks the rest by monthly opportunity (ties broken
    by severity), and writes a JSON summary to ``output_path``.

    Args:
        csv_path: Path to the SADA billing CSV; its basename is recorded as
            the report ``source``.
        output_path: Where to write the JSON summary.

    Returns:
        The summary dict that was written.
    """
    print(f"[GhostBusters GCP] Loading {csv_path}")
    df = load_sada_csv(csv_path)

    total_spend = round(df["subtotal"].sum(), 2)
    total_list_cost = round(df["list_cost"].sum(), 2)
    total_savings = round(df["total_savings"].sum(), 2)
    savings_pct = round(total_savings / total_list_cost * 100, 1) if total_list_cost else 0

    print(f"[GhostBusters GCP] {len(df)} services, total spend: ${total_spend:,.2f}")

    # Run all detectors
    all_findings: list[dict] = [
        *detect_spend_spikes(df),
        *detect_sharp_drops(df),
        *detect_ai_spend(df, total_spend),
        *detect_cud_opportunity(df),
        *detect_excessive_support(df, total_spend),
        *detect_logging_costs(df),
        *detect_new_services(df),
        *detect_unused_savings(df),
    ]

    # De-duplicate per (category, service).  Keying on the detector name would
    # keep both the "cud_opportunity" and the "zero_savings_programs" finding
    # for the same service, and the same ~30% CUD saving would then be counted
    # twice in the opportunity totals.
    sev_order = {"HIGH": 0, "MEDIUM": 1, "LOW": 2}
    deduped = _dedupe_findings(all_findings, sev_order)

    # Rank by monthly_opportunity desc, then by severity
    deduped.sort(key=lambda x: (-x["monthly_opportunity"], sev_order.get(x["severity"], 3)))
    for i, f in enumerate(deduped, 1):
        f["rank"] = i

    # Services list for dashboard charts
    services_list = df.sort_values("subtotal", ascending=False)[
        ["service", "service_id", "list_cost", "subtotal", "total_savings",
         "pct_change", "pct_change_raw", "is_new"]
    ].to_dict("records")

    total_opportunity = round(sum(f.get("monthly_opportunity", 0) for f in deduped), 2)

    waste_by_category: dict[str, float] = {}
    for f in deduped:
        waste_by_category[f["category"]] = (
            waste_by_category.get(f["category"], 0) + f["monthly_opportunity"]
        )
    waste_by_category = {cat: round(total, 2) for cat, total in waste_by_category.items()}

    # Missing percentages surface from pandas as NaN; json.dump would emit them
    # as the bare token NaN, which strict JSON parsers reject.
    summary = _nan_to_none({
        "cloud": "GCP",
        "source": os.path.basename(csv_path),
        "generated_at": datetime.now().isoformat(),
        "total_monthly_spend": total_spend,
        "total_list_cost": total_list_cost,
        "total_savings_applied": total_savings,
        "savings_efficiency_pct": savings_pct,
        "total_monthly_opportunity": total_opportunity,
        "total_annual_opportunity": round(total_opportunity * 12, 2),
        "total_findings": len(deduped),
        "services": services_list,
        "findings": deduped,
        "waste_by_category": waste_by_category,
    })

    with open(output_path, "w", encoding="utf-8") as fh:
        json.dump(summary, fh, indent=2)

    print(f"[GhostBusters GCP] {len(deduped)} findings, ${total_opportunity:,.2f}/mo opportunity → {output_path}")
    return summary


if __name__ == "__main__":
    sada_csv = os.environ.get(
        "GHOSTBUSTERS_GCP_CSV",
        "Perforce Software, Inc. - SADA_Reports, 2026-05-01 \u2014 2026-05-26.csv"
    )
    out_path = os.environ.get("GHOSTBUSTERS_GCP_FINDINGS", "gcp_findings.json")
    run_gcp_detection(sada_csv, out_path)