IELTS Training Analysis¶
This notebook analyzes my IELTS practice-test results over time, including:
- Score trends
- Band score distribution
- Time management analysis
- Recent performance comparison
P.S. I use 趴趴模考中心一站式备考服务 to practice IELTS.
In [1]:
Copied!
# Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import yaml
import numpy as np
from datetime import datetime
# Notebook-wide matplotlib defaults, applied in a single update call.
# plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
plt.rcParams.update({
    'axes.unicode_minus': False,
    'font.size': 12,
})
# Load and process data
def load_test_data(section, data):
    """Load test data for a specific section (Listening/Reading).

    Args:
        section: Key of the section inside ``data``.
        data: Mapping of section name -> {test name -> details}, where each
            details mapping provides 'date', 'score', 'band' and 'time'.

    Returns:
        A DataFrame with the columns test, date, score, band and time, one
        row per test. The columns are present even when the section is empty.
    """
    columns = ['test', 'date', 'score', 'band', 'time']
    records = []
    for test, details in data[section].items():
        taken_at = details['date']
        # A YAML loader may already have turned an unquoted timestamp into a
        # datetime object; only parse when the value is not one already.
        if not isinstance(taken_at, datetime):
            taken_at = datetime.strptime(taken_at, '%Y-%m-%d %H:%M:%S')
        records.append({
            'test': test,
            'date': taken_at,
            'score': details['score'],
            'band': details['band'],
            'time': details['time'],
        })
    # Passing the column list keeps the schema stable for an empty section,
    # so callers can still sort/select by column name.
    return pd.DataFrame(records, columns=columns)
# Read YAML file. The encoding is explicit so that decoding does not depend
# on the platform's default locale.
with open('train_log.yml', 'r', encoding='utf-8') as file:
    data = yaml.safe_load(file)

# Create and sort DataFrames (newest test first)
listening_df = load_test_data('Listening', data).sort_values('date', ascending=False)
reading_df = load_test_data('Reading', data).sort_values('date', ascending=False)

# Add test numbers (most recent = 1)
listening_df['test_number'] = range(1, len(listening_df) + 1)
reading_df['test_number'] = range(1, len(reading_df) + 1)
# Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import yaml
import numpy as np
from datetime import datetime
# Notebook-wide matplotlib defaults, applied in a single update call.
# plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
plt.rcParams.update({
    'axes.unicode_minus': False,
    'font.size': 12,
})
# Load and process data
def load_test_data(section, data):
    """Load test data for a specific section (Listening/Reading).

    Args:
        section: Key of the section inside ``data``.
        data: Mapping of section name -> {test name -> details}, where each
            details mapping provides 'date', 'score', 'band' and 'time'.

    Returns:
        A DataFrame with the columns test, date, score, band and time, one
        row per test. The columns are present even when the section is empty.
    """
    columns = ['test', 'date', 'score', 'band', 'time']
    records = []
    for test, details in data[section].items():
        taken_at = details['date']
        # A YAML loader may already have turned an unquoted timestamp into a
        # datetime object; only parse when the value is not one already.
        if not isinstance(taken_at, datetime):
            taken_at = datetime.strptime(taken_at, '%Y-%m-%d %H:%M:%S')
        records.append({
            'test': test,
            'date': taken_at,
            'score': details['score'],
            'band': details['band'],
            'time': details['time'],
        })
    # Passing the column list keeps the schema stable for an empty section,
    # so callers can still sort/select by column name.
    return pd.DataFrame(records, columns=columns)
# Read YAML file. The encoding is explicit so that decoding does not depend
# on the platform's default locale.
with open('train_log.yml', 'r', encoding='utf-8') as file:
    data = yaml.safe_load(file)

# Create and sort DataFrames (newest test first)
listening_df = load_test_data('Listening', data).sort_values('date', ascending=False)
reading_df = load_test_data('Reading', data).sort_values('date', ascending=False)

# Add test numbers (most recent = 1)
listening_df['test_number'] = range(1, len(listening_df) + 1)
reading_df['test_number'] = range(1, len(reading_df) + 1)
Overall Performance Analysis¶
The following plots show:
- Score trends over time
- Band score distribution
- Time management trends for both sections
In [2]:
Copied!
from scipy import stats
# Create main analysis plots: a 2x2 grid with score trends, band
# distribution, and completion time for each section.
fig, axes = plt.subplots(2, 2, figsize=(20, 16))

# 1. Score trends
ax = axes[0, 0]
ax.plot(listening_df['test_number'], listening_df['score'], marker='o', label='Listening', linewidth=2)
ax.plot(reading_df['test_number'], reading_df['score'], marker='o', label='Reading', linewidth=2)

# Add trend lines and calculate R² and p-value.
# pearsonr returns both the correlation coefficient and its p-value, so the
# coefficient is taken from it instead of being computed a second time.
z1 = np.polyfit(listening_df['test_number'], listening_df['score'], 1)
p1_func = np.poly1d(z1)
r1, pval1 = stats.pearsonr(listening_df['test_number'], listening_df['score'])
r1_2 = r1**2
line1 = ax.plot(listening_df['test_number'], p1_func(listening_df['test_number']),
                '--', color='blue', alpha=0.5)
# Anchor the annotation on the fitted line at the mean test number.
ax.annotate(f'R²={r1_2:.3f}, p={pval1:.3f}',
            xy=(listening_df['test_number'].mean(), p1_func(listening_df['test_number'].mean())),
            xytext=(10, 10), textcoords='offset points',
            color='blue', alpha=0.7)

z2 = np.polyfit(reading_df['test_number'], reading_df['score'], 1)
p2_func = np.poly1d(z2)
r2, pval2 = stats.pearsonr(reading_df['test_number'], reading_df['score'])
r2_2 = r2**2
line2 = ax.plot(reading_df['test_number'], p2_func(reading_df['test_number']),
                '--', color='orange', alpha=0.5)
ax.annotate(f'R²={r2_2:.3f}, p={pval2:.3f}',
            xy=(reading_df['test_number'].mean(), p2_func(reading_df['test_number'].mean())),
            xytext=(10, -10), textcoords='offset points',
            color='orange', alpha=0.7)

ax.set_title('IELTS Score Trends', fontsize=16, pad=20)
ax.set_xlabel('Test Number (Earliest → Most Recent)', fontsize=14)
ax.set_ylabel('Score', fontsize=14)
ax.legend(fontsize=12)
ax.grid(True, alpha=0.3)
ax.set_ylim([20, 40])
# Test number 1 is the most recent test; inverting the axis makes time run
# left to right.
ax.invert_xaxis()

# 2. Band score distribution
ax = axes[0, 1]
bp = ax.boxplot([listening_df['band'], reading_df['band']],
                tick_labels=['Listening', 'Reading'],
                patch_artist=True)
for box in bp['boxes']:
    box.set(facecolor='lightblue', alpha=0.7)
ax.set_title('Band Score Distribution', fontsize=16, pad=20)
ax.set_ylabel('Band Score', fontsize=14)
ax.grid(True, alpha=0.3)
ax.set_ylim([5, 9])

# 3. Listening completion time
ax = axes[1, 0]
ax.plot(listening_df['test_number'], listening_df['time'], marker='o', color='green', linewidth=2)
ax.set_title('Listening Completion Time', fontsize=16, pad=20)
ax.set_xlabel('Test Number (Earliest → Most Recent)', fontsize=14)
ax.set_ylabel('Time (seconds)', fontsize=14)
ax.grid(True, alpha=0.3)
ax.invert_xaxis()

# 4. Reading completion time
ax = axes[1, 1]
ax.plot(reading_df['test_number'], reading_df['time'], marker='o', color='purple', linewidth=2)
ax.set_title('Reading Completion Time', fontsize=16, pad=20)
# This axis is inverted like the other panels, so the label must read
# earliest-to-most-recent as well.
ax.set_xlabel('Test Number (Earliest → Most Recent)', fontsize=14)
ax.set_ylabel('Time (seconds)', fontsize=14)
ax.grid(True, alpha=0.3)
ax.invert_xaxis()

plt.tight_layout()
plt.show()
def check_significance(r_squared, p_value, alpha=0.05):
    """Build a one-paragraph (Chinese) summary of a regression's fit.

    Args:
        r_squared: Coefficient of determination of the fit.
        p_value: p-value to compare against ``alpha``.
        alpha: Significance threshold; the effect is reported as significant
            only when ``p_value`` is strictly below it.

    Returns:
        The summary sentence as a single string.
    """
    if p_value < alpha:
        significance = "显著"
    else:
        significance = "不显著"
    fragments = [
        f"决定系数为 {r_squared:.3f},表示训练次数可以解释分数约 {r_squared*100:.1f}% 的变异。",
        f"回归系数的 p 值为 {p_value:.3f},",
        f"说明训练次数对分数的影响在统计上{significance}",
        f"(以 alpha = {alpha} 为阈值)。",
    ]
    return "".join(fragments)
# Summarise both regressions, Reading first, then Listening.
reading_summary = check_significance(r2_2, pval2)
listening_summary = check_significance(r1_2, pval1)
print(f"阅读分数的回归模型:{reading_summary}")
print(f"听力分数的回归模型:{listening_summary}")
from scipy import stats
# Create main analysis plots: a 2x2 grid with score trends, band
# distribution, and completion time for each section.
fig, axes = plt.subplots(2, 2, figsize=(20, 16))

# 1. Score trends
ax = axes[0, 0]
ax.plot(listening_df['test_number'], listening_df['score'], marker='o', label='Listening', linewidth=2)
ax.plot(reading_df['test_number'], reading_df['score'], marker='o', label='Reading', linewidth=2)

# Add trend lines and calculate R² and p-value.
# pearsonr returns both the correlation coefficient and its p-value, so the
# coefficient is taken from it instead of being computed a second time.
z1 = np.polyfit(listening_df['test_number'], listening_df['score'], 1)
p1_func = np.poly1d(z1)
r1, pval1 = stats.pearsonr(listening_df['test_number'], listening_df['score'])
r1_2 = r1**2
line1 = ax.plot(listening_df['test_number'], p1_func(listening_df['test_number']),
                '--', color='blue', alpha=0.5)
# Anchor the annotation on the fitted line at the mean test number.
ax.annotate(f'R²={r1_2:.3f}, p={pval1:.3f}',
            xy=(listening_df['test_number'].mean(), p1_func(listening_df['test_number'].mean())),
            xytext=(10, 10), textcoords='offset points',
            color='blue', alpha=0.7)

z2 = np.polyfit(reading_df['test_number'], reading_df['score'], 1)
p2_func = np.poly1d(z2)
r2, pval2 = stats.pearsonr(reading_df['test_number'], reading_df['score'])
r2_2 = r2**2
line2 = ax.plot(reading_df['test_number'], p2_func(reading_df['test_number']),
                '--', color='orange', alpha=0.5)
ax.annotate(f'R²={r2_2:.3f}, p={pval2:.3f}',
            xy=(reading_df['test_number'].mean(), p2_func(reading_df['test_number'].mean())),
            xytext=(10, -10), textcoords='offset points',
            color='orange', alpha=0.7)

ax.set_title('IELTS Score Trends', fontsize=16, pad=20)
ax.set_xlabel('Test Number (Earliest → Most Recent)', fontsize=14)
ax.set_ylabel('Score', fontsize=14)
ax.legend(fontsize=12)
ax.grid(True, alpha=0.3)
ax.set_ylim([20, 40])
# Test number 1 is the most recent test; inverting the axis makes time run
# left to right.
ax.invert_xaxis()

# 2. Band score distribution
ax = axes[0, 1]
bp = ax.boxplot([listening_df['band'], reading_df['band']],
                tick_labels=['Listening', 'Reading'],
                patch_artist=True)
for box in bp['boxes']:
    box.set(facecolor='lightblue', alpha=0.7)
ax.set_title('Band Score Distribution', fontsize=16, pad=20)
ax.set_ylabel('Band Score', fontsize=14)
ax.grid(True, alpha=0.3)
ax.set_ylim([5, 9])

# 3. Listening completion time
ax = axes[1, 0]
ax.plot(listening_df['test_number'], listening_df['time'], marker='o', color='green', linewidth=2)
ax.set_title('Listening Completion Time', fontsize=16, pad=20)
ax.set_xlabel('Test Number (Earliest → Most Recent)', fontsize=14)
ax.set_ylabel('Time (seconds)', fontsize=14)
ax.grid(True, alpha=0.3)
ax.invert_xaxis()

# 4. Reading completion time
ax = axes[1, 1]
ax.plot(reading_df['test_number'], reading_df['time'], marker='o', color='purple', linewidth=2)
ax.set_title('Reading Completion Time', fontsize=16, pad=20)
# This axis is inverted like the other panels, so the label must read
# earliest-to-most-recent as well.
ax.set_xlabel('Test Number (Earliest → Most Recent)', fontsize=14)
ax.set_ylabel('Time (seconds)', fontsize=14)
ax.grid(True, alpha=0.3)
ax.invert_xaxis()

plt.tight_layout()
plt.show()
def check_significance(r_squared, p_value, alpha=0.05):
    """Build a one-paragraph (Chinese) summary of a regression's fit.

    Args:
        r_squared: Coefficient of determination of the fit.
        p_value: p-value to compare against ``alpha``.
        alpha: Significance threshold; the effect is reported as significant
            only when ``p_value`` is strictly below it.

    Returns:
        The summary sentence as a single string.
    """
    if p_value < alpha:
        significance = "显著"
    else:
        significance = "不显著"
    fragments = [
        f"决定系数为 {r_squared:.3f},表示训练次数可以解释分数约 {r_squared*100:.1f}% 的变异。",
        f"回归系数的 p 值为 {p_value:.3f},",
        f"说明训练次数对分数的影响在统计上{significance}",
        f"(以 alpha = {alpha} 为阈值)。",
    ]
    return "".join(fragments)
# Summarise both regressions, Reading first, then Listening.
reading_summary = check_significance(r2_2, pval2)
listening_summary = check_significance(r1_2, pval1)
print(f"阅读分数的回归模型:{reading_summary}")
print(f"听力分数的回归模型:{listening_summary}")
阅读分数的回归模型:决定系数为 0.165,表示训练次数可以解释分数约 16.5% 的变异。回归系数的 p 值为 0.075,说明训练次数对分数的影响在统计上不显著(以 alpha = 0.05 为阈值)。 听力分数的回归模型:决定系数为 0.405,表示训练次数可以解释分数约 40.5% 的变异。回归系数的 p 值为 0.000,说明训练次数对分数的影响在统计上显著(以 alpha = 0.05 为阈值)。
Training Frequency Statistics¶
Shows how often I practiced IELTS Listening and Reading, drawn as one calendar grid per month.
In [3]:
Copied!
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib.patches import Patch
import calendar
# Build the per-day training series covering the first to the last test day.
listening_dates = pd.to_datetime(listening_df['date']).dt.normalize()
reading_dates = pd.to_datetime(reading_df['date']).dt.normalize()
start_date = min(listening_dates.min(), reading_dates.min())
end_date = max(listening_dates.max(), reading_dates.max())
date_range = pd.date_range(start=start_date, end=end_date, freq='D')

# Encode each day as a bit mask: 1 = Listening, 2 = Reading, 3 = both.
# Membership flags are used instead of adding once per test, so the value
# stays within 0..3 even when a section was practised several times on the
# same day (per-test addition would mislabel such days or produce values
# that have no colour).
has_listening = date_range.isin(listening_dates)
has_reading = date_range.isin(reading_dates)
training_data = pd.Series(
    has_listening.astype(int) + 2 * has_reading.astype(int),
    index=date_range,
)

# Colour for each day code
colors = {
    0: '#f5f5f5',
    1: '#9ecae1',
    2: '#fc9272',
    3: '#807dba'
}
cmap = ListedColormap([colors[i] for i in range(4)])

# Months that contain at least one training day
nonzero_months = training_data[training_data > 0].index.to_period('M').unique()

# One subplot per month
n_cols = 3
n_rows = -(-len(nonzero_months) // n_cols)  # ceiling division
# squeeze=False always yields a 2-D array of axes, so flatten() is safe for
# any grid size.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows), squeeze=False)
axes = axes.flatten()

for i, period in enumerate(nonzero_months):
    ax = axes[i]
    month_data = training_data[training_data.index.to_period('M') == period]

    # Draw one cell per day
    for date, value in month_data.items():
        x = date.weekday()  # 0=Mon
        # Row = week of the month, offset by the weekday of the 1st.
        y = ((date.day + date.replace(day=1).weekday() - 1) // 7)
        ax.add_patch(plt.Rectangle((x, y), 1, 1, color=colors[value]))

    # Styling
    ax.set_xlim(0, 7)
    ax.set_ylim(0, 6)
    ax.set_xticks(range(7))
    ax.set_xticklabels(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
    ax.set_yticks([])
    ax.set_title(f'{period.strftime("%B %Y")}')
    ax.set_aspect('equal')
    ax.invert_yaxis()

# Remove unused subplots (counted from the number of months, not from the
# leftover loop variable)
for j in range(len(nonzero_months), len(axes)):
    fig.delaxes(axes[j])

# Add legend
legend_elements = [
    Patch(facecolor=colors[1], label='Listening'),
    Patch(facecolor=colors[2], label='Reading'),
    Patch(facecolor=colors[3], label='Both')
]
fig.legend(handles=legend_elements, loc='upper right')
plt.tight_layout()
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib.patches import Patch
import calendar
# Build the per-day training series covering the first to the last test day.
listening_dates = pd.to_datetime(listening_df['date']).dt.normalize()
reading_dates = pd.to_datetime(reading_df['date']).dt.normalize()
start_date = min(listening_dates.min(), reading_dates.min())
end_date = max(listening_dates.max(), reading_dates.max())
date_range = pd.date_range(start=start_date, end=end_date, freq='D')

# Encode each day as a bit mask: 1 = Listening, 2 = Reading, 3 = both.
# Membership flags are used instead of adding once per test, so the value
# stays within 0..3 even when a section was practised several times on the
# same day (per-test addition would mislabel such days or produce values
# that have no colour).
has_listening = date_range.isin(listening_dates)
has_reading = date_range.isin(reading_dates)
training_data = pd.Series(
    has_listening.astype(int) + 2 * has_reading.astype(int),
    index=date_range,
)

# Colour for each day code
colors = {
    0: '#f5f5f5',
    1: '#9ecae1',
    2: '#fc9272',
    3: '#807dba'
}
cmap = ListedColormap([colors[i] for i in range(4)])

# Months that contain at least one training day
nonzero_months = training_data[training_data > 0].index.to_period('M').unique()

# One subplot per month
n_cols = 3
n_rows = -(-len(nonzero_months) // n_cols)  # ceiling division
# squeeze=False always yields a 2-D array of axes, so flatten() is safe for
# any grid size.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows), squeeze=False)
axes = axes.flatten()

for i, period in enumerate(nonzero_months):
    ax = axes[i]
    month_data = training_data[training_data.index.to_period('M') == period]

    # Draw one cell per day
    for date, value in month_data.items():
        x = date.weekday()  # 0=Mon
        # Row = week of the month, offset by the weekday of the 1st.
        y = ((date.day + date.replace(day=1).weekday() - 1) // 7)
        ax.add_patch(plt.Rectangle((x, y), 1, 1, color=colors[value]))

    # Styling
    ax.set_xlim(0, 7)
    ax.set_ylim(0, 6)
    ax.set_xticks(range(7))
    ax.set_xticklabels(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
    ax.set_yticks([])
    ax.set_title(f'{period.strftime("%B %Y")}')
    ax.set_aspect('equal')
    ax.invert_yaxis()

# Remove unused subplots (counted from the number of months, not from the
# leftover loop variable)
for j in range(len(nonzero_months), len(axes)):
    fig.delaxes(axes[j])

# Add legend
legend_elements = [
    Patch(facecolor=colors[1], label='Listening'),
    Patch(facecolor=colors[2], label='Reading'),
    Patch(facecolor=colors[3], label='Both')
]
fig.legend(handles=legend_elements, loc='upper right')
plt.tight_layout()
plt.show()
Recent Performance Analysis¶
Detailed comparison of the most recent 7 tests, showing:
- Direct score comparison between Listening and Reading
- Band scores for each test
- Clear visualization of recent trends
In [4]:
Copied!
# Create recent performance comparison
plt.figure(figsize=(15, 8))

# Get recent test data: head() takes the first rows of each frame, up to
# n_recent of them.
n_recent = 7
recent_listening = listening_df.head(n_recent)
recent_reading = reading_df.head(n_recent)

# Set up bar positions. Positions are derived from the number of rows
# actually available, so the plot still works when a section has fewer than
# n_recent tests (a fixed-length position array would not match the data).
x_listening = np.arange(len(recent_listening))
x_reading = np.arange(len(recent_reading))
x = np.arange(max(len(recent_listening), len(recent_reading)))
width = 0.35

# Create bars (Listening left of each tick, Reading right of it)
plt.bar(x_listening - width/2, recent_listening['score'], width, label='Listening', color='skyblue', alpha=0.7)
plt.bar(x_reading + width/2, recent_reading['score'], width, label='Reading', color='lightcoral', alpha=0.7)

# Add score labels with band scores
for i, (score, band) in enumerate(zip(recent_listening['score'], recent_listening['band'])):
    plt.text(i - width/2, score + 0.5, f'{score}\n(B{band})', ha='center', va='bottom', fontsize=10)
for i, (score, band) in enumerate(zip(recent_reading['score'], recent_reading['band'])):
    plt.text(i + width/2, score + 0.5, f'{score}\n(B{band})', ha='center', va='bottom', fontsize=10)

# Configure plot
plt.title('Recent Test Performance', fontsize=16, pad=20)
plt.xlabel('Test Number (Most Recent → Earliest)', fontsize=12)
plt.ylabel('Score', fontsize=12)
plt.legend(fontsize=12)
plt.xticks(x, x + 1)  # tick labels are 1-based test numbers
plt.ylim(20, 40)
plt.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()
# Create recent performance comparison
plt.figure(figsize=(15, 8))

# Get recent test data: head() takes the first rows of each frame, up to
# n_recent of them.
n_recent = 7
recent_listening = listening_df.head(n_recent)
recent_reading = reading_df.head(n_recent)

# Set up bar positions. Positions are derived from the number of rows
# actually available, so the plot still works when a section has fewer than
# n_recent tests (a fixed-length position array would not match the data).
x_listening = np.arange(len(recent_listening))
x_reading = np.arange(len(recent_reading))
x = np.arange(max(len(recent_listening), len(recent_reading)))
width = 0.35

# Create bars (Listening left of each tick, Reading right of it)
plt.bar(x_listening - width/2, recent_listening['score'], width, label='Listening', color='skyblue', alpha=0.7)
plt.bar(x_reading + width/2, recent_reading['score'], width, label='Reading', color='lightcoral', alpha=0.7)

# Add score labels with band scores
for i, (score, band) in enumerate(zip(recent_listening['score'], recent_listening['band'])):
    plt.text(i - width/2, score + 0.5, f'{score}\n(B{band})', ha='center', va='bottom', fontsize=10)
for i, (score, band) in enumerate(zip(recent_reading['score'], recent_reading['band'])):
    plt.text(i + width/2, score + 0.5, f'{score}\n(B{band})', ha='center', va='bottom', fontsize=10)

# Configure plot
plt.title('Recent Test Performance', fontsize=16, pad=20)
plt.xlabel('Test Number (Most Recent → Earliest)', fontsize=12)
plt.ylabel('Score', fontsize=12)
plt.legend(fontsize=12)
plt.xticks(x, x + 1)  # tick labels are 1-based test numbers
plt.ylim(20, 40)
plt.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()
Statistical Summary¶
Key statistics for both Listening and Reading sections:
In [5]:
Copied!
# Calculate and display statistics
def print_section_stats(df, section):
    """Print summary statistics for one test section.

    Args:
        df: DataFrame with 'score', 'band' and 'time' columns.
        section: Section name used in the heading line.
    """
    scores = df['score']
    bands = df['band']
    best_row = scores.idxmax()
    worst_row = scores.idxmin()
    report_lines = (
        f"{section} Statistics:",
        "-" * 50,
        f"Average Score: {scores.mean():.2f}",
        f"Average Band: {bands.mean():.2f}",
        # The band shown is the one from the row holding the extreme score.
        f"Highest Score: {scores.max()} (Band {df.loc[best_row, 'band']})",
        f"Lowest Score: {scores.min()} (Band {df.loc[worst_row, 'band']})",
        f"Average Completion Time: {df['time'].mean():.2f} seconds\n",
    )
    for line in report_lines:
        print(line)
# Report Listening first, then Reading.
for section_df, section_name in ((listening_df, "Listening"), (reading_df, "Reading")):
    print_section_stats(section_df, section_name)
# Calculate and display statistics
def print_section_stats(df, section):
    """Print summary statistics for one test section.

    Args:
        df: DataFrame with 'score', 'band' and 'time' columns.
        section: Section name used in the heading line.
    """
    scores = df['score']
    bands = df['band']
    best_row = scores.idxmax()
    worst_row = scores.idxmin()
    report_lines = (
        f"{section} Statistics:",
        "-" * 50,
        f"Average Score: {scores.mean():.2f}",
        f"Average Band: {bands.mean():.2f}",
        # The band shown is the one from the row holding the extreme score.
        f"Highest Score: {scores.max()} (Band {df.loc[best_row, 'band']})",
        f"Lowest Score: {scores.min()} (Band {df.loc[worst_row, 'band']})",
        f"Average Completion Time: {df['time'].mean():.2f} seconds\n",
    )
    for line in report_lines:
        print(line)
# Report Listening first, then Reading.
for section_df, section_name in ((listening_df, "Listening"), (reading_df, "Reading")):
    print_section_stats(section_df, section_name)
Listening Statistics: -------------------------------------------------- Average Score: 30.08 Average Band: 6.90 Highest Score: 37 (Band 8.5) Lowest Score: 25 (Band 6.0) Average Completion Time: 1620.27 seconds Reading Statistics: -------------------------------------------------- Average Score: 34.15 Average Band: 7.72 Highest Score: 38 (Band 8.5) Lowest Score: 28 (Band 6.5) Average Completion Time: 3210.55 seconds