# elon_py/pkg/dash/func/info_func.py
from datetime import datetime, timedelta

import numpy as np
import pytz
from scipy.stats import norm

from pkg.config import render_data
from pkg.tool import aggregate_data, get_tweets_since_last_friday

def get_last_7_days_data():
    """Return rows of the global aggregate frame dated within the last 7 days (US/Eastern)."""
    est = pytz.timezone('US/Eastern')
    now = datetime.now(est).date()
    last_7_days = [now - timedelta(days=i) for i in range(7)]
    data = render_data.global_agg_df[
        render_data.global_agg_df['date'].isin(last_7_days)].copy()
    return data

def get_hourly_weighted_array():
    """Build a 24-element array of normalized hourly tweet-rate weights from the last 7 days."""
    data = get_last_7_days_data()
    if data.empty:
        return [1 / 24] * 24  # no history: assume a uniform hourly distribution
    agg_data = aggregate_data(data, 60)
    one_day_data = agg_data.groupby('interval_group')['tweet_count'].sum().reset_index()
    tweet_count_total = one_day_data['tweet_count'].sum()
    hourly_rates = [0] * 24
    for _, row in one_day_data.iterrows():
        minute = row['interval_group']
        hour = int(minute // 60)
        if hour < 24:
            hourly_rates[hour] = row['tweet_count'] / tweet_count_total if tweet_count_total > 0 else 0
    # Renormalize so the weights sum to 1; fall back to uniform if there were no tweets.
    total_rate = sum(hourly_rates)
    if total_rate > 0:
        hourly_rates = [rate / total_rate for rate in hourly_rates]
    else:
        hourly_rates = [1 / 24] * 24
    return hourly_rates

def calculate_variance_factor():
    """Estimate an over-dispersion factor (variance / mean of hourly counts); default to 1.5."""
    data = get_last_7_days_data()
    if data.empty or 'tweet_count' not in data.columns:
        return 1.5
    data['hour'] = data['minute_of_day'] // 60
    hourly_data = data.groupby(['date', 'hour'])['tweet_count'].sum().reset_index()
    hourly_stats = hourly_data.groupby('hour')['tweet_count'].agg(['mean', 'var']).reset_index()
    variance_factors = (hourly_stats['var'] / hourly_stats['mean']).dropna()
    if variance_factors.empty:
        return 1.5
    factor = float(variance_factors.mean())
    # An `x or 1.5` fallback mishandles NaN (NaN is truthy), so guard explicitly instead.
    return factor if factor > 0 else 1.5

def get_dynamic_hourly_weights():
    """Like get_hourly_weighted_array, but weights recent days' profiles more heavily."""
    data = get_last_7_days_data()
    if data.empty:
        return [1 / 24] * 24
    # Per-day blending weights, applied in the order days appear in the data.
    weights = [0.2, 0.2, 0.3, 0.3, 0.5, 0.5, 0.5]
    hourly_rates = [0] * 24
    for day_idx, day in enumerate(data['date'].unique()):
        day_data = data[data['date'] == day].copy()
        if day_data.empty:
            continue
        agg_data = aggregate_data(day_data, 60)
        day_tweets = agg_data.groupby('interval_group')['tweet_count'].sum().reset_index()
        day_total = day_tweets['tweet_count'].sum()
        for _, row in day_tweets.iterrows():
            minute = row['interval_group']
            hour = int(minute // 60)
            if hour < 24:
                hourly_rates[hour] += (row['tweet_count'] / day_total if day_total > 0 else 0) * weights[day_idx % 7]
    total_rate = sum(hourly_rates)
    if total_rate > 0:
        hourly_rates = [rate / total_rate for rate in hourly_rates]
    else:
        hourly_rates = [1 / 24] * 24
    return hourly_rates

def get_pace_params():
    """Return (tweets since last Friday, fractional days until next Friday noon ET)."""
    est = pytz.timezone('US/Eastern')
    now = datetime.now(est)
    today = now.date()
    days_to_next_friday = (4 - today.weekday()) % 7
    next_friday = (now.replace(hour=12, minute=0, second=0, microsecond=0) +
                   timedelta(days=days_to_next_friday))
    if now > next_friday:
        next_friday += timedelta(days=7)
    days_to_next_friday = (next_friday - now).total_seconds() / (24 * 60 * 60)
    tweet_count = get_tweets_since_last_friday()
    return tweet_count, days_to_next_friday

def calculate_tweet_pace():
    """Linearly extrapolate the current tweet count out to next Friday noon."""
    tweet_count, days_to_next_friday = get_pace_params()
    days_elapsed = 7 - days_to_next_friday
    if days_elapsed <= 0:  # guard: window just reset, nothing to extrapolate from
        return float(tweet_count)
    pace = (tweet_count / days_elapsed) * days_to_next_friday + tweet_count
    return round(pace, 6) if pace > 0 else float(tweet_count)

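# Worked example for calculate_tweet_pace (hypothetical numbers): with 60 tweets after
# 3 elapsed days and 4 days remaining, pace = (60 / 3) * 4 + 60 = 140 projected tweets.
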
def calculate_pace_decline_rate():
    """Hourly rate at which the projected pace falls if no new tweets arrive."""
    tweet_count, days_to_next_friday = get_pace_params()
    T = 7
    days_elapsed = T - days_to_next_friday
    if days_elapsed <= 0:  # guard against division by zero at the window boundary
        return 0.0
    # pace = tweet_count * T / days_elapsed, so d(pace)/d(days_elapsed) = -tweet_count * T / days_elapsed**2.
    decline_per_day = -(tweet_count * T) / (days_elapsed ** 2)
    decline_per_hour = decline_per_day / 24
    return round(decline_per_hour, 2)

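# Worked example for calculate_pace_decline_rate (hypothetical numbers): with 60 tweets
# and 3 days elapsed, decline_per_day = -(60 * 7) / 3**2 = -46.67, i.e. about -1.94/hour.
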
def calculate_pace_for_increment(increment, hours, tweet_count, days_to_next_friday, current_pace):
    """Project the pace `hours` from now, assuming `increment` additional tweets by then.

    `current_pace` is accepted for interface compatibility but is not used in the projection.
    """
    future_days = days_to_next_friday - (hours / 24)
    new_tweet_count = tweet_count + increment
    if future_days <= 0:
        return round(new_tweet_count, 2)
    new_pace = (new_tweet_count / (7 - future_days)) * future_days + new_tweet_count
    return round(max(new_pace, new_tweet_count), 2)

def calculate_pace_increase_in_hour(increment_value, hour_value):
    """Compute projected paces for preset one-hour increments plus an optional custom one."""
    tweet_count, days_to_next_friday = get_pace_params()
    current_pace = (tweet_count / (7 - days_to_next_friday)) * days_to_next_friday + tweet_count
    increments = [0, 1, 5, 10, 20]
    pace_increases = {}
    for inc in increments:
        pace_increases[f'increase_{inc}'] = calculate_pace_for_increment(
            inc, 1, tweet_count, days_to_next_friday, current_pace
        )
    if increment_value is None or hour_value is None:
        pace_increases['custom_increment'] = None
    else:
        increment = int(increment_value)
        hours = int(hour_value)
        pace_increases['custom_increment'] = calculate_pace_for_increment(
            increment, hours, tweet_count, days_to_next_friday, current_pace
        )
        pace_increases['custom_increment_key'] = increment
    return pace_increases

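# Illustrative result shape for calculate_pace_increase_in_hour (values depend on live data):
# {'increase_0': ..., 'increase_1': ..., 'increase_5': ..., 'increase_10': ...,
#  'increase_20': ..., 'custom_increment': ..., 'custom_increment_key': ...}
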
def calculate_avg_tweets_per_day(target, now, remain):
    """Average tweets per day needed to reach `target` from `now` within `remain` hours."""
    Xi = get_hourly_weighted_array()
    if remain <= 0:
        return "remain<=0"
    if target <= now:
        return "Already reached"
    # Hours beyond the first 12 count at face value; the first (up to) 12 hours are
    # weighted by the historical hourly distribution and rescaled back to hour units.
    fx = max(remain - 12, 0)
    if remain > 12:
        fy = sum(Xi[0:12]) * 24
    else:
        full_hours = int(remain)
        fractional_hour = remain - full_hours
        if full_hours >= 24:
            full_hours = 23
            fractional_hour = 0
        if full_hours < 0:
            full_hours = 0
        if full_hours > 0:
            fy = sum(Xi[0:full_hours]) + Xi[full_hours] * fractional_hour
        else:
            fy = Xi[0] * fractional_hour
        fy *= 24
    if fx + fy == 0:
        return "fx + fy = 0"
    # (fx + fy) / 24 is the effective number of remaining days.
    result = (target - now) / ((fx + fy) / 24)
    return round(result, 2)

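# Worked example with a uniform hourly distribution (Xi[i] = 1/24 for every i):
# remain = 30h gives fx = 18 and fy = (12 * 1/24) * 24 = 12, so the effective window
# is (18 + 12) / 24 = 1.25 days; target=100, now=50 then needs 50 / 1.25 = 40 per day.
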
def calculate_tweet_probability(tweet_count, days_to_next_friday, prob_start, prob_end, peak_percentile=75):
    """Estimate a probability range that the final weekly count lands in [prob_start, prob_end]."""
    remaining_hours = days_to_next_friday * 24
    hourly_weights = get_dynamic_hourly_weights()
    data = get_last_7_days_data()
    if data.empty:
        recent_tweets = [70] * 7  # fallback: assume 70 tweets/day when there is no history
    else:
        agg_data = aggregate_data(data, 1440)
        daily_tweets = agg_data.groupby('date')['tweet_count'].sum().reset_index()
        recent_tweets = daily_tweets['tweet_count'].tolist()[-7:]
        if len(recent_tweets) < 7:
            recent_tweets = recent_tweets + [70] * (7 - len(recent_tweets))
    # Daily average: the last 3 days weighted 0.8, the 4 days before them 0.2.
    recent_3_days = np.mean(recent_tweets[-3:])
    past_4_days = np.mean(recent_tweets[:-3]) if len(recent_tweets) > 3 else 70
    daily_avg = 0.8 * recent_3_days + 0.2 * past_4_days
    daily_avg_std = np.std(recent_tweets) if len(recent_tweets) >= 7 else 0.0
    # Find contiguous "peak" hour segments whose weight exceeds the given percentile.
    peak_threshold = np.percentile(hourly_weights, peak_percentile)
    segments = []
    current_segment = []
    for i in range(24):
        if hourly_weights[i] >= peak_threshold:
            current_segment.append(i)
        elif current_segment:
            segments.append(current_segment)
            current_segment = []
    if current_segment:
        segments.append(current_segment)
    lambda_remaining = 0  # accumulated below but currently unused in the final estimate
    variance_factor = calculate_variance_factor()
    total_weight = sum(hourly_weights)
    for segment in segments:
        hours_in_segment = len(segment) * (remaining_hours / 24)
        segment_weight_avg = np.mean([hourly_weights[i] for i in segment])
        lambda_segment = daily_avg * (hours_in_segment / remaining_hours) * (segment_weight_avg / (total_weight / 24))
        lambda_remaining += lambda_segment
    # Normal approximation with pessimistic/optimistic means one std-dev apart; the
    # variance is inflated by the empirical over-dispersion factor.
    mu_low = (daily_avg - daily_avg_std) * (remaining_hours / 24)
    mu_high = (daily_avg + daily_avg_std) * (remaining_hours / 24)
    var_low = mu_low * variance_factor
    var_high = mu_high * variance_factor
    sigma_low = np.sqrt(var_low)
    sigma_high = np.sqrt(var_high)
    # Remaining tweets needed to enter the [prob_start, prob_end] band.
    a = prob_start - tweet_count
    b = prob_end - tweet_count
    if tweet_count > prob_end:
        return "0.0000 - 0.0000"
    if a < 0:
        a = 0
    if b < 0:
        return "0.0000 - 0.0000"
    prob_low = norm.cdf((b - mu_low) / sigma_low) - norm.cdf((a - mu_low) / sigma_low)
    prob_high = norm.cdf((b - mu_high) / sigma_high) - norm.cdf((a - mu_high) / sigma_high)
    prob_low = max(0.0, min(1.0, prob_low))
    prob_high = max(0.0, min(1.0, prob_high))
    prob_min = min(prob_low, prob_high)
    prob_max = max(prob_low, prob_high)
    return f"{prob_min:.4f} - {prob_max:.4f}"
