+probability(debug&test)

This commit is contained in:
NY 2025-03-14 15:55:44 +08:00
parent 192bd42d0e
commit 8a2614b073
2 changed files with 89 additions and 74 deletions

View File

@ -247,13 +247,20 @@ def layout_config(app):
html.Tr([ html.Tr([
html.Td("Test Time:", style={'paddingRight': '10px'}), html.Td("Test Time:", style={'paddingRight': '10px'}),
html.Td( html.Td(
dcc.Input( html.Div([
id='test-time-input', dcc.Input(
type='text', id='test-time-input',
placeholder='HH:MM:SS (e.g., 12:00:00)', type='text',
value='12:00:00', placeholder='HH:MM:SS (e.g., 12:00:00)', # 增强提示
style={'width': '100%'} value='12:00:00',
) pattern='[0-2][0-9]:[0-5][0-9]:[0-5][0-9]', # 限制格式
style={'width': '100%'}
),
html.Span(
"Enter time in HH:MM:SS format (e.g., 12:00:00)",
style={'fontSize': '12px', 'color': 'gray', 'marginTop': '5px', 'display': 'block'}
)
])
) )
]), ]),
html.Tr([ html.Tr([
@ -267,7 +274,7 @@ def layout_config(app):
'width': '50%', 'width': '50%',
'marginTop': '10px', 'marginTop': '10px',
'borderCollapse': 'collapse' 'borderCollapse': 'collapse'
}) }),
], style={'marginLeft': '50px'}), ], style={'marginLeft': '50px'}),
dcc.Interval(id='clock-interval', interval=1000, n_intervals=0) dcc.Interval(id='clock-interval', interval=1000, n_intervals=0)

View File

@ -3,6 +3,7 @@ from pkg.dash.app_init import app
from dash.dependencies import Input, Output from dash.dependencies import Input, Output
from dash import html from dash import html
import pandas as pd import pandas as pd
import re
from datetime import timedelta from datetime import timedelta
@app.callback( @app.callback(
@ -16,91 +17,98 @@ def update_test_info(n_clicks, test_date, test_time):
return [html.Div("Click 'Test' to see historical probability results.")] return [html.Div("Click 'Test' to see historical probability results.")]
est = pytz.timezone('US/Eastern') est = pytz.timezone('US/Eastern')
data = render_data.global_agg_df.copy()
# 调试:打印输入值
print(f"test_date: {test_date}, test_time: {test_time}")
# 检查输入是否为空
if not test_date or not test_time:
return [html.Div("Date or time input is empty. Please provide both date (YYYY-MM-DD) and time (HH:MM:SS).")]
# 验证时间格式
time_pattern = r'^(?:[01]\d|2[0-3]):[0-5]\d:[0-5]\d$' # HH:MM:SS (00:00:00 to 23:59:59)
if not re.match(time_pattern, test_time):
return [html.Div("Invalid time format. Use HH:MM:SS (e.g., 12:00:00) with hours 00-23, minutes 00-59, seconds 00-59.")]
# 重构 datetime_est，处理夏令时模糊时间
data['hours'] = data['minute_of_day'] // 60
data['minutes'] = data['minute_of_day'] % 60
data['datetime_est'] = pd.to_datetime(
data['date'].astype(str) + ' ' +
data['hours'].astype(str) + ':' +
data['minutes'].astype(str) + ':00',
errors='coerce'
).dt.tz_localize(est, ambiguous='NaT')
if data['datetime_est'].isna().any():
print("Warning: Some datetime_est values are NaT due to ambiguous time handling")
# 解析测试日期和时间 # 解析测试日期和时间
try: try:
test_date = pd.to_datetime(test_date).date() test_date = pd.to_datetime(test_date, format='%Y-%m-%d').date()
test_datetime = pd.to_datetime(f"{test_date} {test_time}").tz_localize(est) # 使用 est test_datetime = pd.to_datetime(f"{test_date} {test_time}", format='%Y-%m-%d %H:%M:%S').tz_localize(est, ambiguous=True)
except ValueError: except ValueError as e:
print(f"Error parsing date/time: {e}")
return [html.Div("Invalid date or time format. Use YYYY-MM-DD and HH:MM:SS (e.g., 12:00:00).")] return [html.Div("Invalid date or time format. Use YYYY-MM-DD and HH:MM:SS (e.g., 12:00:00).")]
# 1. 计算到 test_datetime 的累计推文数(模拟当时的 tweet_count # 计算周期开始时间(上一个周五 12:00 PM
data = render_data.global_agg_df.copy() test_date_only = test_datetime.replace(hour=0, minute=0, second=0, microsecond=0) # 只考虑日期部分
historical_data = data[data['datetime_est'] <= test_datetime] days_to_last_friday = (test_date_only.weekday() - 4) % 7 # 4 表示周五
if historical_data.empty: cycle_start = test_date_only - timedelta(days=days_to_last_friday)
return [html.Div(f"No data available up to {test_datetime}")] cycle_start = cycle_start.replace(hour=12, minute=0, second=0, microsecond=0) # 已经是 tz-aware直接调整时间
tweet_count = historical_data['tweet_count'].sum()
# 2. 计算实际最终推文数(到当天结束时的总数) # 确保周期结束时间(下周五 12:00 PM EDT考虑夏令时
day_end = pd.to_datetime(f"{test_date} 23:59:59").tz_localize(est) # 使用 est cycle_end = cycle_start + timedelta(days=7)
actual_data = data[(data['date'] == test_date) & (data['datetime_est'] <= day_end)] if cycle_end.month == 3 and 8 <= cycle_end.day <= 14: # 粗略检查夏令时开始3月第二个星期日
cycle_end = cycle_end.tz_convert(est) # 转换为 EDT
else:
cycle_end = cycle_end.tz_convert(est) # 保持一致
# 调试:打印周期信息
print(f"Cycle Start: {cycle_start}, Cycle End: {cycle_end}")
# 过滤周期内的数据
cycle_data = data[(data['datetime_est'] >= cycle_start) & (data['datetime_est'] <= test_datetime)]
if cycle_data.empty:
return [html.Div(f"No data available in cycle from {cycle_start} to {test_datetime}")]
tweet_count = cycle_data['tweet_count'].sum()
# 计算实际最终推文数(周期结束时的总数)
actual_data = data[(data['datetime_est'] >= cycle_start) & (data['datetime_est'] <= cycle_end)]
if actual_data.empty: if actual_data.empty:
return [html.Div(f"No data available for {test_date}")] return [html.Div(f"No data available for cycle ending {cycle_end}")]
actual_end_count = actual_data['tweet_count'].sum() actual_end_count = actual_data['tweet_count'].sum()
# 3. 模拟 days_to_next_friday从 test_datetime 到下周五) # 计算 days_to_next_friday从 test_datetime 到周期结束)
days_to_next_friday = (4 - test_date.weekday()) % 7 days_to_next_friday = (cycle_end - test_datetime).total_seconds() / (24 * 60 * 60)
next_friday = (test_datetime.replace(hour=12, minute=0, second=0, microsecond=0) +
timedelta(days=days_to_next_friday))
if test_datetime > next_friday:
next_friday += timedelta(days=7)
days_to_next_friday = (next_friday - test_datetime).total_seconds() / (24 * 60 * 60)
# 4. 设置预测范围(基于实际最终推文数的 ±10% # 设置预测范围
prob_start = actual_end_count * 0.9 # 90% of actual prob_start = actual_end_count * 0.9
prob_end = actual_end_count * 1.1 # 110% of actual prob_end = actual_end_count * 1.1
# 5. 调用原始的 calculate_tweet_probability() 计算概率 # 计算概率
probability = calculate_tweet_probability(tweet_count, days_to_next_friday, prob_start, prob_end) probability = calculate_tweet_probability(tweet_count, days_to_next_friday, prob_start, prob_end)
prob_min, prob_max = map(float, probability.split(" - ")) prob_min, prob_max = map(float, probability.split(" - "))
formatted_probability = f"{prob_min * 100:.2f}% - {prob_max * 100:.2f}%" formatted_probability = f"{prob_min * 100:.2f}% - {prob_max * 100:.2f}%"
# 6. 构建测试结果表格 # 构建测试结果表格
test_table_rows = [ test_table_rows = [
html.Tr([ html.Tr([html.Th("Test Date and Time:", colSpan=2), html.Td(str(test_datetime), colSpan=6)]),
html.Th("Test Date and Time:", colSpan=2, style={'paddingRight': '10px'}), html.Tr([html.Th("Tweet Count at Test Time:", colSpan=2), html.Td(str(tweet_count), colSpan=6)]),
html.Td(str(test_datetime), colSpan=6, style={'paddingRight': '10px'}) html.Tr([html.Th("Actual Final Tweet Count:", colSpan=2), html.Td(str(actual_end_count), colSpan=6)]),
]), html.Tr([html.Th(f"Predicted Range ({int(prob_start)}-{int(prob_end)}):", colSpan=2), html.Td(formatted_probability, colSpan=6)]),
html.Tr([ html.Tr([html.Th("Does Actual Fall in Range?", colSpan=2),
html.Th("Tweet Count at Test Time:", colSpan=2, style={'paddingRight': '10px'}), html.Td("Yes" if prob_start <= actual_end_count <= prob_end else "No",
html.Td(str(tweet_count), colSpan=6, style={'paddingRight': '10px'}) colSpan=6, style={'color': 'green' if prob_start <= actual_end_count <= prob_end else 'red'})])
]),
html.Tr([
html.Th("Actual Final Tweet Count:", colSpan=2, style={'paddingRight': '10px'}),
html.Td(str(actual_end_count), colSpan=6, style={'paddingRight': '10px'})
]),
html.Tr([
html.Th(f"Predicted Range ({int(prob_start)}-{int(prob_end)}):", colSpan=2, style={'paddingRight': '10px'}),
html.Td(formatted_probability, colSpan=6, style={'paddingRight': '10px'})
]),
html.Tr([
html.Th("Does Actual Fall in Range?", colSpan=2, style={'paddingRight': '10px'}),
html.Td(
"Yes" if prob_start <= actual_end_count <= prob_end else "No",
colSpan=6,
style={'paddingRight': '10px', 'color': 'green' if prob_start <= actual_end_count <= prob_end else 'red'}
)
])
] ]
if prob_start <= actual_end_count <= prob_end: if prob_start <= actual_end_count <= prob_end:
expected_prob = (prob_max + prob_min) / 2 expected_prob = (prob_max + prob_min) / 2
test_table_rows.append( test_table_rows.append(html.Tr([html.Th("Expected Probability:", colSpan=2),
html.Tr([ html.Td(f"~{expected_prob * 100:.2f}% (should be high if model fits)", colSpan=6)]))
html.Th("Expected Probability:", colSpan=2, style={'paddingRight': '10px'}),
html.Td(f"~{expected_prob * 100:.2f}% (should be high if model fits)", colSpan=6, style={'paddingRight': '10px'})
])
)
else: else:
test_table_rows.append( test_table_rows.append(html.Tr([html.Th("Note:", colSpan=2),
html.Tr([ html.Td("Model prediction does not match actual outcome.", colSpan=6, style={'color': 'red'})]))
html.Th("Note:", colSpan=2, style={'paddingRight': '10px'}),
html.Td("Model prediction does not match actual outcome.", colSpan=6, style={'paddingRight': '10px', 'color': 'red'})
])
)
test_table = html.Table(test_table_rows, style={ test_table = html.Table(test_table_rows, style={'width': '100%', 'textAlign': 'left', 'borderCollapse': 'collapse'})
'width': '100%',
'textAlign': 'left',
'borderCollapse': 'collapse'
})
return [test_table] return [test_table]