IELTS Training Analysis¶

This notebook analyzes my training process on IELTS over time, including:

Score trends
Band score distribution
Time management analysis
Recent performance comparison

P.S. I use 趴趴模考中心一站式备考服务 to practice IELTS.

In [1]:

Copied!





# Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import yaml
import numpy as np
from datetime import datetime


# Configure matplotlib settings
# plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['font.size'] = 12

# Load and process data
def load_test_data(section, data):
    """Load test data for a specific section (Listening/Reading)"""
    return pd.DataFrame([{
        'test': test,
        'date': datetime.strptime(details['date'], '%Y-%m-%d %H:%M:%S'),
        'score': details['score'],
        'band': details['band'],
        'time': details['time']
    } for test, details in data[section].items()])

# Read YAML file
with open('train_log.yml', 'r') as file:
    data = yaml.safe_load(file)

# Create and sort DataFrames
listening_df = load_test_data('Listening', data).sort_values('date', ascending=False)
reading_df = load_test_data('Reading', data).sort_values('date', ascending=False)

# Add test numbers (most recent = 1)
listening_df['test_number'] = range(1, len(listening_df) + 1)
reading_df['test_number'] = range(1, len(reading_df) + 1)
# Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import yaml
import numpy as np
from datetime import datetime


# Configure matplotlib settings
# plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['font.size'] = 12

# Load and process data
def load_test_data(section, data):
    """Load test data for a specific section (Listening/Reading)"""
    return pd.DataFrame([{
        'test': test,
        'date': datetime.strptime(details['date'], '%Y-%m-%d %H:%M:%S'),
        'score': details['score'],
        'band': details['band'],
        'time': details['time']
    } for test, details in data[section].items()])

# Read YAML file
with open('train_log.yml', 'r') as file:
    data = yaml.safe_load(file)

# Create and sort DataFrames
listening_df = load_test_data('Listening', data).sort_values('date', ascending=False)
reading_df = load_test_data('Reading', data).sort_values('date', ascending=False)

# Add test numbers (most recent = 1)
listening_df['test_number'] = range(1, len(listening_df) + 1)
reading_df['test_number'] = range(1, len(reading_df) + 1)

Overall Performance Analysis¶

The following plots show:

Score trends over time
Band score distribution
Time management trends for both sections

In [2]:

Copied!





from scipy import stats
# Create main analysis plots
fig, axes = plt.subplots(2, 2, figsize=(20, 16))

# 1. Score trends
ax = axes[0, 0]
ax.plot(listening_df['test_number'], listening_df['score'], marker='o', label='Listening', linewidth=2)
ax.plot(reading_df['test_number'], reading_df['score'], marker='o', label='Reading', linewidth=2)

# Add trend lines and calculate R² and p-value
z1 = np.polyfit(listening_df['test_number'], listening_df['score'], 1)
p1_func = np.poly1d(z1)
r1 = np.corrcoef(listening_df['test_number'], listening_df['score'])[0,1]
r1_2 = r1**2
_, pval1 = stats.pearsonr(listening_df['test_number'], listening_df['score'])
line1 = ax.plot(listening_df['test_number'], p1_func(listening_df['test_number']), 
        '--', color='blue', alpha=0.5)
ax.annotate(f'R²={r1_2:.3f}, p={pval1:.3f}', 
           xy=(listening_df['test_number'].mean(), p1_func(listening_df['test_number'].mean())),
           xytext=(10, 10), textcoords='offset points',
           color='blue', alpha=0.7)

z2 = np.polyfit(reading_df['test_number'], reading_df['score'], 1)
p2_func = np.poly1d(z2)
r2 = np.corrcoef(reading_df['test_number'], reading_df['score'])[0,1]
r2_2 = r2**2
_, pval2 = stats.pearsonr(reading_df['test_number'], reading_df['score'])
line2 = ax.plot(reading_df['test_number'], p2_func(reading_df['test_number']), 
        '--', color='orange', alpha=0.5)
ax.annotate(f'R²={r2_2:.3f}, p={pval2:.3f}',
           xy=(reading_df['test_number'].mean(), p2_func(reading_df['test_number'].mean())),
           xytext=(10, -10), textcoords='offset points', 
           color='orange', alpha=0.7)

ax.set_title('IELTS Score Trends', fontsize=16, pad=20)
ax.set_xlabel('Test Number (Earliest → Most Recent)', fontsize=14)
ax.set_ylabel('Score', fontsize=14)
ax.legend(fontsize=12)
ax.grid(True, alpha=0.3)
ax.set_ylim([20, 40])
ax.invert_xaxis()

# 2. Band score distribution
ax = axes[0, 1]
bp = ax.boxplot([listening_df['band'], reading_df['band']], 
                tick_labels=['Listening', 'Reading'], 
                patch_artist=True)
for box in bp['boxes']:
    box.set(facecolor='lightblue', alpha=0.7)
ax.set_title('Band Score Distribution', fontsize=16, pad=20)
ax.set_ylabel('Band Score', fontsize=14)
ax.grid(True, alpha=0.3)
ax.set_ylim([5, 9])

# 3. Listening completion time
ax = axes[1, 0]
ax.plot(listening_df['test_number'], listening_df['time'], marker='o', color='green', linewidth=2)
ax.set_title('Listening Completion Time', fontsize=16, pad=20)
ax.set_xlabel('Test Number (Earliest → Most Recent)', fontsize=14)
ax.set_ylabel('Time (seconds)', fontsize=14)
ax.grid(True, alpha=0.3)
ax.invert_xaxis()

# 4. Reading completion time
ax = axes[1, 1]
ax.plot(reading_df['test_number'], reading_df['time'], marker='o', color='purple', linewidth=2)
ax.set_title('Reading Completion Time', fontsize=16, pad=20)
ax.set_xlabel('Test Number (Most Recent → Earliest)', fontsize=14)
ax.set_ylabel('Time (seconds)', fontsize=14)
ax.grid(True, alpha=0.3)
ax.invert_xaxis()

plt.tight_layout()
plt.show()

def check_significance(r_squared, p_value, alpha=0.05):
    significance = "显著" if p_value < alpha else "不显著"
    return f"决定系数为 {r_squared:.3f}，表示训练次数可以解释分数约 {r_squared*100:.1f}% 的变异。" \
           f"回归系数的 p 值为 {p_value:.3f}，" \
           f"说明训练次数对分数的影响在统计上{significance}" \
           f"（以 alpha = {alpha} 为阈值）。"

print(f"阅读分数的回归模型：{check_significance(r2_2, pval2)}")
print(f"听力分数的回归模型：{check_significance(r1_2, pval1)}")
from scipy import stats
# Create main analysis plots
fig, axes = plt.subplots(2, 2, figsize=(20, 16))

# 1. Score trends
ax = axes[0, 0]
ax.plot(listening_df['test_number'], listening_df['score'], marker='o', label='Listening', linewidth=2)
ax.plot(reading_df['test_number'], reading_df['score'], marker='o', label='Reading', linewidth=2)

# Add trend lines and calculate R² and p-value
z1 = np.polyfit(listening_df['test_number'], listening_df['score'], 1)
p1_func = np.poly1d(z1)
r1 = np.corrcoef(listening_df['test_number'], listening_df['score'])[0,1]
r1_2 = r1**2
_, pval1 = stats.pearsonr(listening_df['test_number'], listening_df['score'])
line1 = ax.plot(listening_df['test_number'], p1_func(listening_df['test_number']), 
        '--', color='blue', alpha=0.5)
ax.annotate(f'R²={r1_2:.3f}, p={pval1:.3f}', 
           xy=(listening_df['test_number'].mean(), p1_func(listening_df['test_number'].mean())),
           xytext=(10, 10), textcoords='offset points',
           color='blue', alpha=0.7)

z2 = np.polyfit(reading_df['test_number'], reading_df['score'], 1)
p2_func = np.poly1d(z2)
r2 = np.corrcoef(reading_df['test_number'], reading_df['score'])[0,1]
r2_2 = r2**2
_, pval2 = stats.pearsonr(reading_df['test_number'], reading_df['score'])
line2 = ax.plot(reading_df['test_number'], p2_func(reading_df['test_number']), 
        '--', color='orange', alpha=0.5)
ax.annotate(f'R²={r2_2:.3f}, p={pval2:.3f}',
           xy=(reading_df['test_number'].mean(), p2_func(reading_df['test_number'].mean())),
           xytext=(10, -10), textcoords='offset points', 
           color='orange', alpha=0.7)

ax.set_title('IELTS Score Trends', fontsize=16, pad=20)
ax.set_xlabel('Test Number (Earliest → Most Recent)', fontsize=14)
ax.set_ylabel('Score', fontsize=14)
ax.legend(fontsize=12)
ax.grid(True, alpha=0.3)
ax.set_ylim([20, 40])
ax.invert_xaxis()

# 2. Band score distribution
ax = axes[0, 1]
bp = ax.boxplot([listening_df['band'], reading_df['band']], 
                tick_labels=['Listening', 'Reading'], 
                patch_artist=True)
for box in bp['boxes']:
    box.set(facecolor='lightblue', alpha=0.7)
ax.set_title('Band Score Distribution', fontsize=16, pad=20)
ax.set_ylabel('Band Score', fontsize=14)
ax.grid(True, alpha=0.3)
ax.set_ylim([5, 9])

# 3. Listening completion time
ax = axes[1, 0]
ax.plot(listening_df['test_number'], listening_df['time'], marker='o', color='green', linewidth=2)
ax.set_title('Listening Completion Time', fontsize=16, pad=20)
ax.set_xlabel('Test Number (Earliest → Most Recent)', fontsize=14)
ax.set_ylabel('Time (seconds)', fontsize=14)
ax.grid(True, alpha=0.3)
ax.invert_xaxis()

# 4. Reading completion time
ax = axes[1, 1]
ax.plot(reading_df['test_number'], reading_df['time'], marker='o', color='purple', linewidth=2)
ax.set_title('Reading Completion Time', fontsize=16, pad=20)
ax.set_xlabel('Test Number (Most Recent → Earliest)', fontsize=14)
ax.set_ylabel('Time (seconds)', fontsize=14)
ax.grid(True, alpha=0.3)
ax.invert_xaxis()

plt.tight_layout()
plt.show()

def check_significance(r_squared, p_value, alpha=0.05):
    significance = "显著" if p_value < alpha else "不显著"
    return f"决定系数为 {r_squared:.3f}，表示训练次数可以解释分数约 {r_squared*100:.1f}% 的变异。" \
           f"回归系数的 p 值为 {p_value:.3f}，" \
           f"说明训练次数对分数的影响在统计上{significance}" \
           f"（以 alpha = {alpha} 为阈值）。"

print(f"阅读分数的回归模型：{check_significance(r2_2, pval2)}")
print(f"听力分数的回归模型：{check_significance(r1_2, pval1)}")

No description has been provided for this image

阅读分数的回归模型：决定系数为 0.165，表示训练次数可以解释分数约 16.5% 的变异。回归系数的 p 值为 0.075，说明训练次数对分数的影响在统计上不显著（以 alpha = 0.05 为阈值）。
听力分数的回归模型：决定系数为 0.495，表示训练次数可以解释分数约 49.5% 的变异。回归系数的 p 值为 0.000，说明训练次数对分数的影响在统计上显著（以 alpha = 0.05 为阈值）。

Training Frequency Statistics¶

show the training frequency of my training on IELTS

In [3]:

Copied!





import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib.patches import Patch
import calendar

# 创建训练数据（与你已有代码一样）
listening_dates = pd.to_datetime(listening_df['date']).dt.normalize()
reading_dates = pd.to_datetime(reading_df['date']).dt.normalize()

start_date = min(listening_dates.min(), reading_dates.min())
end_date = max(listening_dates.max(), reading_dates.max())
date_range = pd.date_range(start=start_date, end=end_date, freq='D')

training_data = pd.Series(0, index=date_range)
for date in listening_dates:
    training_data[date] += 1
for date in reading_dates:
    training_data[date] += 2

# 颜色设置
colors = {
    0: '#f5f5f5',
    1: '#9ecae1',
    2: '#fc9272',
    3: '#807dba'
}
cmap = ListedColormap([colors[i] for i in range(4)])

# 提取非零月份
nonzero_months = training_data[training_data > 0].index.to_period('M').unique()

# 按月绘图
n_cols = 3
n_rows = -(-len(nonzero_months) // n_cols)  # 向上取整
fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows))
axes = axes.flatten()

for i, period in enumerate(nonzero_months):
    ax = axes[i]
    month_data = training_data[training_data.index.to_period('M') == period]

    # 生成月日历矩阵
    month_dates = month_data.index
    days = month_dates.day
    weekdays = month_dates.weekday
    week_of_month = ((days - 1 + month_dates[0].weekday()) // 7)

    # 画每个格子
    for date, value in month_data.items():
        x = date.weekday()  # 0=Mon
        y = ((date.day + date.replace(day=1).weekday() - 1) // 7)
        ax.add_patch(plt.Rectangle((x, y), 1, 1, color=colors[value]))

    # 样式
    ax.set_xlim(0, 7)
    ax.set_ylim(0, 6)
    ax.set_xticks(range(7))
    ax.set_xticklabels(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
    ax.set_yticks([])
    ax.set_title(f'{period.strftime("%B %Y")}')
    ax.set_aspect('equal')
    ax.invert_yaxis()

# 移除多余子图
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

# 添加图例
legend_elements = [
    Patch(facecolor=colors[1], label='Listening'),
    Patch(facecolor=colors[2], label='Reading'),
    Patch(facecolor=colors[3], label='Both')
]
fig.legend(handles=legend_elements, loc='upper right')

plt.suptitle('IELTS Training Frequency Calendar', fontsize=16, y=0.98)
plt.tight_layout()
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib.patches import Patch
import calendar

# 创建训练数据（与你已有代码一样）
listening_dates = pd.to_datetime(listening_df['date']).dt.normalize()
reading_dates = pd.to_datetime(reading_df['date']).dt.normalize()

start_date = min(listening_dates.min(), reading_dates.min())
end_date = max(listening_dates.max(), reading_dates.max())
date_range = pd.date_range(start=start_date, end=end_date, freq='D')

training_data = pd.Series(0, index=date_range)
for date in listening_dates:
    training_data[date] += 1
for date in reading_dates:
    training_data[date] += 2

# 颜色设置
colors = {
    0: '#f5f5f5',
    1: '#9ecae1',
    2: '#fc9272',
    3: '#807dba'
}
cmap = ListedColormap([colors[i] for i in range(4)])

# 提取非零月份
nonzero_months = training_data[training_data > 0].index.to_period('M').unique()

# 按月绘图
n_cols = 3
n_rows = -(-len(nonzero_months) // n_cols)  # 向上取整
fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows))
axes = axes.flatten()

for i, period in enumerate(nonzero_months):
    ax = axes[i]
    month_data = training_data[training_data.index.to_period('M') == period]

    # 生成月日历矩阵
    month_dates = month_data.index
    days = month_dates.day
    weekdays = month_dates.weekday
    week_of_month = ((days - 1 + month_dates[0].weekday()) // 7)

    # 画每个格子
    for date, value in month_data.items():
        x = date.weekday()  # 0=Mon
        y = ((date.day + date.replace(day=1).weekday() - 1) // 7)
        ax.add_patch(plt.Rectangle((x, y), 1, 1, color=colors[value]))

    # 样式
    ax.set_xlim(0, 7)
    ax.set_ylim(0, 6)
    ax.set_xticks(range(7))
    ax.set_xticklabels(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
    ax.set_yticks([])
    ax.set_title(f'{period.strftime("%B %Y")}')
    ax.set_aspect('equal')
    ax.invert_yaxis()

# 移除多余子图
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

# 添加图例
legend_elements = [
    Patch(facecolor=colors[1], label='Listening'),
    Patch(facecolor=colors[2], label='Reading'),
    Patch(facecolor=colors[3], label='Both')
]
fig.legend(handles=legend_elements, loc='upper right')

plt.suptitle('IELTS Training Frequency Calendar', fontsize=16, y=0.98)
plt.tight_layout()
plt.show()

Recent Performance Analysis¶

Detailed comparison of the most recent 7 tests, showing:

Direct score comparison between Listening and Reading
Band scores for each test
Clear visualization of recent trends

In [4]:

Copied!





# Create recent performance comparison
plt.figure(figsize=(15, 8))

# Get recent test data
recent_listening = listening_df.head(7)
recent_reading = reading_df.head(7)

# Set up bar positions
x = np.arange(7)
width = 0.35

# Create bars
plt.bar(x - width/2, recent_listening['score'], width, label='Listening', color='skyblue', alpha=0.7)
plt.bar(x + width/2, recent_reading['score'], width, label='Reading', color='lightcoral', alpha=0.7)

# Add score labels with band scores
for i, (score, band) in enumerate(zip(recent_listening['score'], recent_listening['band'])):
    plt.text(i - width/2, score + 0.5, f'{score}\n(B{band})', ha='center', va='bottom', fontsize=10)
for i, (score, band) in enumerate(zip(recent_reading['score'], recent_reading['band'])):
    plt.text(i + width/2, score + 0.5, f'{score}\n(B{band})', ha='center', va='bottom', fontsize=10)

# Configure plot
plt.title('Recent IELTS Test Performance Comparison', fontsize=16, pad=20)
plt.xlabel('Test Number (Most Recent → Earliest)', fontsize=12)
plt.ylabel('Score', fontsize=12)
plt.legend(fontsize=12)
plt.xticks(x, range(1, 8))
plt.ylim(20, 40)
plt.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()
# Create recent performance comparison
plt.figure(figsize=(15, 8))

# Get recent test data
recent_listening = listening_df.head(7)
recent_reading = reading_df.head(7)

# Set up bar positions
x = np.arange(7)
width = 0.35

# Create bars
plt.bar(x - width/2, recent_listening['score'], width, label='Listening', color='skyblue', alpha=0.7)
plt.bar(x + width/2, recent_reading['score'], width, label='Reading', color='lightcoral', alpha=0.7)

# Add score labels with band scores
for i, (score, band) in enumerate(zip(recent_listening['score'], recent_listening['band'])):
    plt.text(i - width/2, score + 0.5, f'{score}\n(B{band})', ha='center', va='bottom', fontsize=10)
for i, (score, band) in enumerate(zip(recent_reading['score'], recent_reading['band'])):
    plt.text(i + width/2, score + 0.5, f'{score}\n(B{band})', ha='center', va='bottom', fontsize=10)

# Configure plot
plt.title('Recent IELTS Test Performance Comparison', fontsize=16, pad=20)
plt.xlabel('Test Number (Most Recent → Earliest)', fontsize=12)
plt.ylabel('Score', fontsize=12)
plt.legend(fontsize=12)
plt.xticks(x, range(1, 8))
plt.ylim(20, 40)
plt.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

Statistical Summary¶

Key statistics for both Listening and Reading sections:

In [5]:

Copied!





# Calculate and display statistics
def print_section_stats(df, section):
    """Print statistics for a test section"""
    print(f"{section} Statistics:")
    print("-" * 50)
    print(f"Average Score: {df['score'].mean():.2f}")
    print(f"Average Band: {df['band'].mean():.2f}")
    print(f"Highest Score: {df['score'].max()} (Band {df.loc[df['score'].idxmax(), 'band']})")
    print(f"Lowest Score: {df['score'].min()} (Band {df.loc[df['score'].idxmin(), 'band']})")
    print(f"Average Completion Time: {df['time'].mean():.2f} seconds\n")

print_section_stats(listening_df, "Listening")
print_section_stats(reading_df, "Reading")
# Calculate and display statistics
def print_section_stats(df, section):
    """Print statistics for a test section"""
    print(f"{section} Statistics:")
    print("-" * 50)
    print(f"Average Score: {df['score'].mean():.2f}")
    print(f"Average Band: {df['band'].mean():.2f}")
    print(f"Highest Score: {df['score'].max()} (Band {df.loc[df['score'].idxmax(), 'band']})")
    print(f"Lowest Score: {df['score'].min()} (Band {df.loc[df['score'].idxmin(), 'band']})")
    print(f"Average Completion Time: {df['time'].mean():.2f} seconds\n")

print_section_stats(listening_df, "Listening")
print_section_stats(reading_df, "Reading")

Listening Statistics:
--------------------------------------------------
Average Score: 30.50
Average Band: 6.98
Highest Score: 37 (Band 8.5)
Lowest Score: 25 (Band 6.0)
Average Completion Time: 1629.04 seconds

Reading Statistics:
--------------------------------------------------
Average Score: 34.15
Average Band: 7.72
Highest Score: 38 (Band 8.5)
Lowest Score: 28 (Band 6.5)
Average Completion Time: 3210.55 seconds