IELTS Training Analysis¶

This notebook analyzes my IELTS practice-test results over time, including:

Score trends
Band score distribution
Time management analysis
Recent performance comparison

P.S. I use 趴趴模考中心一站式备考服务 to practice IELTS.

In [1]:

Copied!





# Import required libraries
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml

# Notebook-wide matplotlib defaults, applied in a single update.
# Optional font override, left disabled:
# plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
plt.rcParams.update(
    {
        "axes.unicode_minus": False,
        "font.size": 12,
    }
)


# Load and process data
def load_test_data(section, data):
    """Build a DataFrame of test attempts for one section (Listening/Reading).

    Args:
        section: Top-level key of ``data`` to read, e.g. "Listening".
        data: Parsed training log. ``data[section]`` maps a test name to a
            mapping with the keys "date", "score", "band" and "time".

    Returns:
        pandas.DataFrame with the columns "test", "date", "score", "band" and
        "time", one row per test in the mapping's iteration order. The columns
        are present even when the section holds no tests, so callers can sort
        by "date" unconditionally.

    Raises:
        KeyError: If ``section`` or one of the per-test keys is missing.
        ValueError: If a string date does not match "%Y-%m-%d %H:%M:%S".
    """
    columns = ["test", "date", "score", "band", "time"]
    records = []
    # A section key with no entries is loaded from YAML as None; treat it as empty.
    for test, details in (data[section] or {}).items():
        raw_date = details["date"]
        # YAML loaders turn unquoted timestamps into datetime objects already,
        # so only string dates still need parsing.
        if isinstance(raw_date, datetime):
            parsed_date = raw_date
        else:
            parsed_date = datetime.strptime(raw_date, "%Y-%m-%d %H:%M:%S")
        records.append(
            {
                "test": test,
                "date": parsed_date,
                "score": details["score"],
                "band": details["band"],
                "time": details["time"],
            }
        )
    return pd.DataFrame(records, columns=columns)


# Read the YAML training log. The encoding is pinned so that non-ASCII text in
# the log decodes the same way on every platform instead of following the locale.
with open("train_log.yml", "r", encoding="utf-8") as file:
    data = yaml.safe_load(file)

# Build one frame per section, newest attempt first
listening_df = load_test_data("Listening", data).sort_values("date", ascending=False)
reading_df = load_test_data("Reading", data).sort_values("date", ascending=False)

# Number the attempts by recency (most recent = 1)
listening_df["test_number"] = range(1, len(listening_df) + 1)
reading_df["test_number"] = range(1, len(reading_df) + 1)
# Import required libraries
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml

# Notebook-wide matplotlib defaults, applied in a single update.
# Optional font override, left disabled:
# plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
plt.rcParams.update(
    {
        "axes.unicode_minus": False,
        "font.size": 12,
    }
)


# Load and process data
def load_test_data(section, data):
    """Build a DataFrame of test attempts for one section (Listening/Reading).

    Args:
        section: Top-level key of ``data`` to read, e.g. "Listening".
        data: Parsed training log. ``data[section]`` maps a test name to a
            mapping with the keys "date", "score", "band" and "time".

    Returns:
        pandas.DataFrame with the columns "test", "date", "score", "band" and
        "time", one row per test in the mapping's iteration order. The columns
        are present even when the section holds no tests, so callers can sort
        by "date" unconditionally.

    Raises:
        KeyError: If ``section`` or one of the per-test keys is missing.
        ValueError: If a string date does not match "%Y-%m-%d %H:%M:%S".
    """
    columns = ["test", "date", "score", "band", "time"]
    records = []
    # A section key with no entries is loaded from YAML as None; treat it as empty.
    for test, details in (data[section] or {}).items():
        raw_date = details["date"]
        # YAML loaders turn unquoted timestamps into datetime objects already,
        # so only string dates still need parsing.
        if isinstance(raw_date, datetime):
            parsed_date = raw_date
        else:
            parsed_date = datetime.strptime(raw_date, "%Y-%m-%d %H:%M:%S")
        records.append(
            {
                "test": test,
                "date": parsed_date,
                "score": details["score"],
                "band": details["band"],
                "time": details["time"],
            }
        )
    return pd.DataFrame(records, columns=columns)


# Read the YAML training log. The encoding is pinned so that non-ASCII text in
# the log decodes the same way on every platform instead of following the locale.
with open("train_log.yml", "r", encoding="utf-8") as file:
    data = yaml.safe_load(file)

# Build one frame per section, newest attempt first
listening_df = load_test_data("Listening", data).sort_values("date", ascending=False)
reading_df = load_test_data("Reading", data).sort_values("date", ascending=False)

# Number the attempts by recency (most recent = 1)
listening_df["test_number"] = range(1, len(listening_df) + 1)
reading_df["test_number"] = range(1, len(reading_df) + 1)

Overall Performance Analysis¶

The following plots show:

Score trends over time
Band score distribution
Time management trends for both sections

In [2]:

Copied!





from scipy import stats

# Create main analysis plots
fig, axes = plt.subplots(2, 2, figsize=(20, 16))

# test_number counts backwards (1 = most recent) and every time-series panel
# below inverts its x axis, so all of them read earliest (left) to most recent
# (right). Sharing one label keeps the panels from contradicting each other.
CHRONOLOGICAL_XLABEL = "Test Number (Earliest → Most Recent)"


def _add_trend_line(ax, df, color, text_offset):
    """Draw the linear score trend of ``df`` on ``ax`` and annotate R² and p.

    Returns the tuple (r_squared, p_value) of the Pearson correlation between
    "test_number" and "score".
    """
    trend = np.poly1d(np.polyfit(df["test_number"], df["score"], 1))
    # pearsonr returns both r and its p-value, so no separate corrcoef call is needed
    r, p_value = stats.pearsonr(df["test_number"], df["score"])
    r_squared = r**2
    ax.plot(df["test_number"], trend(df["test_number"]), "--", color=color, alpha=0.5)
    # Anchor the annotation on the fitted line at the mean test number
    mid_x = df["test_number"].mean()
    ax.annotate(
        f"R²={r_squared:.3f}, p={p_value:.3f}",
        xy=(mid_x, trend(mid_x)),
        xytext=text_offset,
        textcoords="offset points",
        color=color,
        alpha=0.7,
    )
    return r_squared, p_value


def _plot_completion_time(ax, df, title, color):
    """Plot the per-test completion time of one section, earliest test on the left."""
    ax.plot(df["test_number"], df["time"], marker="o", color=color, linewidth=2)
    ax.set_title(title, fontsize=16, pad=20)
    ax.set_xlabel(CHRONOLOGICAL_XLABEL, fontsize=14)
    ax.set_ylabel("Time (seconds)", fontsize=14)
    ax.grid(True, alpha=0.3)
    ax.invert_xaxis()


# 1. Score trends
ax = axes[0, 0]
ax.plot(
    listening_df["test_number"], listening_df["score"], marker="o", label="Listening", linewidth=2
)
ax.plot(reading_df["test_number"], reading_df["score"], marker="o", label="Reading", linewidth=2)

# Add trend lines; R² and p-value are kept at module level for the summary below
r1_2, pval1 = _add_trend_line(ax, listening_df, "blue", (10, 10))
r2_2, pval2 = _add_trend_line(ax, reading_df, "orange", (10, -10))

ax.set_title("IELTS Score Trends", fontsize=16, pad=20)
ax.set_xlabel(CHRONOLOGICAL_XLABEL, fontsize=14)
ax.set_ylabel("Score", fontsize=14)
ax.legend(fontsize=12)
ax.grid(True, alpha=0.3)
ax.set_ylim([20, 40])
ax.invert_xaxis()

# 2. Band score distribution
ax = axes[0, 1]
bp = ax.boxplot(
    [listening_df["band"], reading_df["band"]],
    tick_labels=["Listening", "Reading"],
    patch_artist=True,
)
for box in bp["boxes"]:
    box.set(facecolor="lightblue", alpha=0.7)
ax.set_title("Band Score Distribution", fontsize=16, pad=20)
ax.set_ylabel("Band Score", fontsize=14)
ax.grid(True, alpha=0.3)
ax.set_ylim([5, 9])

# 3. Listening completion time
_plot_completion_time(axes[1, 0], listening_df, "Listening Completion Time", "green")

# 4. Reading completion time (previously labelled in the opposite direction
# even though its axis is inverted like the others)
_plot_completion_time(axes[1, 1], reading_df, "Reading Completion Time", "purple")

plt.tight_layout()
plt.show()


def check_significance(r_squared, p_value, alpha=0.05):
    """Describe a regression's R² and p-value in one Chinese summary sentence.

    Args:
        r_squared: Coefficient of determination, reported to three decimals
            and as a percentage of explained variation.
        p_value: p-value, reported to three decimals.
        alpha: Threshold; the effect is called significant when
            ``p_value < alpha``.

    Returns:
        The summary text as a single string.
    """
    if p_value < alpha:
        verdict = "显著"
    else:
        verdict = "不显著"
    explained_pct = r_squared * 100
    pieces = [
        f"决定系数为 {r_squared:.3f}，表示训练次数可以解释分数约 {explained_pct:.1f}% 的变异。",
        f"回归系数的 p 值为 {p_value:.3f}，",
        f"说明训练次数对分数的影响在统计上{verdict}",
        f"（以 alpha = {alpha} 为阈值）。",
    ]
    return "".join(pieces)


# Summarise both fits, Reading first and Listening second
reading_summary = check_significance(r2_2, pval2)
listening_summary = check_significance(r1_2, pval1)
print(f"阅读分数的回归模型：{reading_summary}")
print(f"听力分数的回归模型：{listening_summary}")
from scipy import stats

# Create main analysis plots
fig, axes = plt.subplots(2, 2, figsize=(20, 16))

# test_number counts backwards (1 = most recent) and every time-series panel
# below inverts its x axis, so all of them read earliest (left) to most recent
# (right). Sharing one label keeps the panels from contradicting each other.
CHRONOLOGICAL_XLABEL = "Test Number (Earliest → Most Recent)"


def _add_trend_line(ax, df, color, text_offset):
    """Draw the linear score trend of ``df`` on ``ax`` and annotate R² and p.

    Returns the tuple (r_squared, p_value) of the Pearson correlation between
    "test_number" and "score".
    """
    trend = np.poly1d(np.polyfit(df["test_number"], df["score"], 1))
    # pearsonr returns both r and its p-value, so no separate corrcoef call is needed
    r, p_value = stats.pearsonr(df["test_number"], df["score"])
    r_squared = r**2
    ax.plot(df["test_number"], trend(df["test_number"]), "--", color=color, alpha=0.5)
    # Anchor the annotation on the fitted line at the mean test number
    mid_x = df["test_number"].mean()
    ax.annotate(
        f"R²={r_squared:.3f}, p={p_value:.3f}",
        xy=(mid_x, trend(mid_x)),
        xytext=text_offset,
        textcoords="offset points",
        color=color,
        alpha=0.7,
    )
    return r_squared, p_value


def _plot_completion_time(ax, df, title, color):
    """Plot the per-test completion time of one section, earliest test on the left."""
    ax.plot(df["test_number"], df["time"], marker="o", color=color, linewidth=2)
    ax.set_title(title, fontsize=16, pad=20)
    ax.set_xlabel(CHRONOLOGICAL_XLABEL, fontsize=14)
    ax.set_ylabel("Time (seconds)", fontsize=14)
    ax.grid(True, alpha=0.3)
    ax.invert_xaxis()


# 1. Score trends
ax = axes[0, 0]
ax.plot(
    listening_df["test_number"], listening_df["score"], marker="o", label="Listening", linewidth=2
)
ax.plot(reading_df["test_number"], reading_df["score"], marker="o", label="Reading", linewidth=2)

# Add trend lines; R² and p-value are kept at module level for the summary below
r1_2, pval1 = _add_trend_line(ax, listening_df, "blue", (10, 10))
r2_2, pval2 = _add_trend_line(ax, reading_df, "orange", (10, -10))

ax.set_title("IELTS Score Trends", fontsize=16, pad=20)
ax.set_xlabel(CHRONOLOGICAL_XLABEL, fontsize=14)
ax.set_ylabel("Score", fontsize=14)
ax.legend(fontsize=12)
ax.grid(True, alpha=0.3)
ax.set_ylim([20, 40])
ax.invert_xaxis()

# 2. Band score distribution
ax = axes[0, 1]
bp = ax.boxplot(
    [listening_df["band"], reading_df["band"]],
    tick_labels=["Listening", "Reading"],
    patch_artist=True,
)
for box in bp["boxes"]:
    box.set(facecolor="lightblue", alpha=0.7)
ax.set_title("Band Score Distribution", fontsize=16, pad=20)
ax.set_ylabel("Band Score", fontsize=14)
ax.grid(True, alpha=0.3)
ax.set_ylim([5, 9])

# 3. Listening completion time
_plot_completion_time(axes[1, 0], listening_df, "Listening Completion Time", "green")

# 4. Reading completion time (previously labelled in the opposite direction
# even though its axis is inverted like the others)
_plot_completion_time(axes[1, 1], reading_df, "Reading Completion Time", "purple")

plt.tight_layout()
plt.show()


def check_significance(r_squared, p_value, alpha=0.05):
    """Describe a regression's R² and p-value in one Chinese summary sentence.

    Args:
        r_squared: Coefficient of determination, reported to three decimals
            and as a percentage of explained variation.
        p_value: p-value, reported to three decimals.
        alpha: Threshold; the effect is called significant when
            ``p_value < alpha``.

    Returns:
        The summary text as a single string.
    """
    if p_value < alpha:
        verdict = "显著"
    else:
        verdict = "不显著"
    explained_pct = r_squared * 100
    pieces = [
        f"决定系数为 {r_squared:.3f}，表示训练次数可以解释分数约 {explained_pct:.1f}% 的变异。",
        f"回归系数的 p 值为 {p_value:.3f}，",
        f"说明训练次数对分数的影响在统计上{verdict}",
        f"（以 alpha = {alpha} 为阈值）。",
    ]
    return "".join(pieces)


# Summarise both fits, Reading first and Listening second
reading_summary = check_significance(r2_2, pval2)
listening_summary = check_significance(r1_2, pval1)
print(f"阅读分数的回归模型：{reading_summary}")
print(f"听力分数的回归模型：{listening_summary}")

No description has been provided for this image

阅读分数的回归模型：决定系数为 0.165，表示训练次数可以解释分数约 16.5% 的变异。回归系数的 p 值为 0.075，说明训练次数对分数的影响在统计上不显著（以 alpha = 0.05 为阈值）。
听力分数的回归模型：决定系数为 0.495，表示训练次数可以解释分数约 49.5% 的变异。回归系数的 p 值为 0.000，说明训练次数对分数的影响在统计上显著（以 alpha = 0.05 为阈值）。

Training Frequency Statistics¶

This calendar shows on which days I practiced IELTS Listening, Reading, or both.

In [3]:

Copied!





import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib.patches import Patch

# Build the per-day training record
listening_dates = pd.to_datetime(listening_df["date"]).dt.normalize()
reading_dates = pd.to_datetime(reading_df["date"]).dt.normalize()

start_date = min(listening_dates.min(), reading_dates.min())
end_date = max(listening_dates.max(), reading_dates.max())
date_range = pd.date_range(start=start_date, end=end_date, freq="D")

# Encode each day as a bit mask (1 = listening, 2 = reading, 3 = both).
# Membership flags are used instead of adding 1 or 2 per test: with the
# additive form, two listening tests on one day were coloured as "Reading" and
# two reading tests produced the value 4, which has no entry in `colors`.
LISTENING_FLAG = 1
READING_FLAG = 2
training_data = pd.Series(
    date_range.isin(listening_dates) * LISTENING_FLAG
    + date_range.isin(reading_dates) * READING_FLAG,
    index=date_range,
)

# Colour per mask value: none, listening only, reading only, both
colors = {0: "#f5f5f5", 1: "#9ecae1", 2: "#fc9272", 3: "#807dba"}
cmap = ListedColormap([colors[i] for i in range(4)])

# Months that contain at least one training day
nonzero_months = training_data[training_data > 0].index.to_period("M").unique()

# One calendar panel per month, laid out on a grid
n_cols = 3
n_rows = -(-len(nonzero_months) // n_cols)  # ceiling division
# squeeze=False always returns a 2-D array, so flatten() works for any grid shape
fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows), squeeze=False)
axes = axes.flatten()

for ax, period in zip(axes, nonzero_months):
    month_data = training_data[training_data.index.to_period("M") == period]

    # Draw one unit square per day: column = weekday, row = week of the month
    for date, value in month_data.items():
        x = date.weekday()  # 0=Mon
        # Shift by the weekday of the 1st so each row starts on a Monday
        y = (date.day + date.replace(day=1).weekday() - 1) // 7
        ax.add_patch(plt.Rectangle((x, y), 1, 1, color=colors[value]))

    # Styling
    ax.set_xlim(0, 7)
    ax.set_ylim(0, 6)
    ax.set_xticks(range(7))
    ax.set_xticklabels(["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"])
    ax.set_yticks([])
    ax.set_title(f"{period.strftime('%B %Y')}")
    ax.set_aspect("equal")
    ax.invert_yaxis()  # first week at the top

# Remove unused panels; slicing avoids relying on a leftover loop index
for extra_ax in axes[len(nonzero_months):]:
    fig.delaxes(extra_ax)

# Add the legend
legend_elements = [
    Patch(facecolor=colors[1], label="Listening"),
    Patch(facecolor=colors[2], label="Reading"),
    Patch(facecolor=colors[3], label="Both"),
]
fig.legend(handles=legend_elements, loc="upper right")

plt.suptitle("IELTS Training Frequency Calendar", fontsize=16, y=0.98)
plt.tight_layout()
plt.show()
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib.patches import Patch

# Build the per-day training record
listening_dates = pd.to_datetime(listening_df["date"]).dt.normalize()
reading_dates = pd.to_datetime(reading_df["date"]).dt.normalize()

start_date = min(listening_dates.min(), reading_dates.min())
end_date = max(listening_dates.max(), reading_dates.max())
date_range = pd.date_range(start=start_date, end=end_date, freq="D")

# Encode each day as a bit mask (1 = listening, 2 = reading, 3 = both).
# Membership flags are used instead of adding 1 or 2 per test: with the
# additive form, two listening tests on one day were coloured as "Reading" and
# two reading tests produced the value 4, which has no entry in `colors`.
LISTENING_FLAG = 1
READING_FLAG = 2
training_data = pd.Series(
    date_range.isin(listening_dates) * LISTENING_FLAG
    + date_range.isin(reading_dates) * READING_FLAG,
    index=date_range,
)

# Colour per mask value: none, listening only, reading only, both
colors = {0: "#f5f5f5", 1: "#9ecae1", 2: "#fc9272", 3: "#807dba"}
cmap = ListedColormap([colors[i] for i in range(4)])

# Months that contain at least one training day
nonzero_months = training_data[training_data > 0].index.to_period("M").unique()

# One calendar panel per month, laid out on a grid
n_cols = 3
n_rows = -(-len(nonzero_months) // n_cols)  # ceiling division
# squeeze=False always returns a 2-D array, so flatten() works for any grid shape
fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows), squeeze=False)
axes = axes.flatten()

for ax, period in zip(axes, nonzero_months):
    month_data = training_data[training_data.index.to_period("M") == period]

    # Draw one unit square per day: column = weekday, row = week of the month
    for date, value in month_data.items():
        x = date.weekday()  # 0=Mon
        # Shift by the weekday of the 1st so each row starts on a Monday
        y = (date.day + date.replace(day=1).weekday() - 1) // 7
        ax.add_patch(plt.Rectangle((x, y), 1, 1, color=colors[value]))

    # Styling
    ax.set_xlim(0, 7)
    ax.set_ylim(0, 6)
    ax.set_xticks(range(7))
    ax.set_xticklabels(["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"])
    ax.set_yticks([])
    ax.set_title(f"{period.strftime('%B %Y')}")
    ax.set_aspect("equal")
    ax.invert_yaxis()  # first week at the top

# Remove unused panels; slicing avoids relying on a leftover loop index
for extra_ax in axes[len(nonzero_months):]:
    fig.delaxes(extra_ax)

# Add the legend
legend_elements = [
    Patch(facecolor=colors[1], label="Listening"),
    Patch(facecolor=colors[2], label="Reading"),
    Patch(facecolor=colors[3], label="Both"),
]
fig.legend(handles=legend_elements, loc="upper right")

plt.suptitle("IELTS Training Frequency Calendar", fontsize=16, y=0.98)
plt.tight_layout()
plt.show()

Recent Performance Analysis¶

Detailed comparison of the most recent 7 tests, showing:

Direct score comparison between Listening and Reading
Band scores for each test
Clear visualization of recent trends

In [4]:

Copied!





# Create recent performance comparison
plt.figure(figsize=(15, 8))

# Number of most recent tests to compare per section
N_RECENT = 7

# Get recent test data (head() takes the first rows of each frame)
recent_listening = listening_df.head(N_RECENT)
recent_reading = reading_df.head(N_RECENT)

# Set up bar positions from the data actually available: a fixed arange(7)
# raised a shape mismatch whenever a section had fewer than seven logged tests.
n_slots = max(len(recent_listening), len(recent_reading))
x = np.arange(n_slots)
width = 0.35

# Create the bars and their "score (band)" labels, one series per section
for recent, offset, label, color in (
    (recent_listening, -width / 2, "Listening", "skyblue"),
    (recent_reading, width / 2, "Reading", "lightcoral"),
):
    positions = x[: len(recent)] + offset
    plt.bar(positions, recent["score"], width, label=label, color=color, alpha=0.7)
    for position, score, band in zip(positions, recent["score"], recent["band"]):
        plt.text(
            position, score + 0.5, f"{score}\n(B{band})", ha="center", va="bottom", fontsize=10
        )

# Configure plot
plt.title("Recent IELTS Test Performance Comparison", fontsize=16, pad=20)
plt.xlabel("Test Number (Most Recent → Earliest)", fontsize=12)
plt.ylabel("Score", fontsize=12)
plt.legend(fontsize=12)
plt.xticks(x, range(1, n_slots + 1))
plt.ylim(20, 40)
plt.grid(True, alpha=0.3, axis="y")

plt.tight_layout()
plt.show()
# Create recent performance comparison
plt.figure(figsize=(15, 8))

# Number of most recent tests to compare per section
N_RECENT = 7

# Get recent test data (head() takes the first rows of each frame)
recent_listening = listening_df.head(N_RECENT)
recent_reading = reading_df.head(N_RECENT)

# Set up bar positions from the data actually available: a fixed arange(7)
# raised a shape mismatch whenever a section had fewer than seven logged tests.
n_slots = max(len(recent_listening), len(recent_reading))
x = np.arange(n_slots)
width = 0.35

# Create the bars and their "score (band)" labels, one series per section
for recent, offset, label, color in (
    (recent_listening, -width / 2, "Listening", "skyblue"),
    (recent_reading, width / 2, "Reading", "lightcoral"),
):
    positions = x[: len(recent)] + offset
    plt.bar(positions, recent["score"], width, label=label, color=color, alpha=0.7)
    for position, score, band in zip(positions, recent["score"], recent["band"]):
        plt.text(
            position, score + 0.5, f"{score}\n(B{band})", ha="center", va="bottom", fontsize=10
        )

# Configure plot
plt.title("Recent IELTS Test Performance Comparison", fontsize=16, pad=20)
plt.xlabel("Test Number (Most Recent → Earliest)", fontsize=12)
plt.ylabel("Score", fontsize=12)
plt.legend(fontsize=12)
plt.xticks(x, range(1, n_slots + 1))
plt.ylim(20, 40)
plt.grid(True, alpha=0.3, axis="y")

plt.tight_layout()
plt.show()

Statistical Summary¶

Key statistics for both Listening and Reading sections:

In [5]:

Copied!





# Calculate and display statistics
def print_section_stats(df, section):
    """Print a plain-text summary of one section's results.

    Reports the mean score, mean band, highest and lowest score (each with the
    band found on that row) and the mean completion time.

    Args:
        df: Frame with "score", "band" and "time" columns.
        section: Name shown in the heading line.
    """

    def _report_lines():
        # Lazily produced so each line is computed right before it is printed
        yield f"{section} Statistics:"
        yield "-" * 50
        yield f"Average Score: {df['score'].mean():.2f}"
        yield f"Average Band: {df['band'].mean():.2f}"
        best_row = df["score"].idxmax()
        yield f"Highest Score: {df['score'].max()} (Band {df.loc[best_row, 'band']})"
        worst_row = df["score"].idxmin()
        yield f"Lowest Score: {df['score'].min()} (Band {df.loc[worst_row, 'band']})"
        yield f"Average Completion Time: {df['time'].mean():.2f} seconds\n"

    for line in _report_lines():
        print(line)


# Report Listening first, then Reading
for section_df, section_name in ((listening_df, "Listening"), (reading_df, "Reading")):
    print_section_stats(section_df, section_name)
# Calculate and display statistics
def print_section_stats(df, section):
    """Print a plain-text summary of one section's results.

    Reports the mean score, mean band, highest and lowest score (each with the
    band found on that row) and the mean completion time.

    Args:
        df: Frame with "score", "band" and "time" columns.
        section: Name shown in the heading line.
    """

    def _report_lines():
        # Lazily produced so each line is computed right before it is printed
        yield f"{section} Statistics:"
        yield "-" * 50
        yield f"Average Score: {df['score'].mean():.2f}"
        yield f"Average Band: {df['band'].mean():.2f}"
        best_row = df["score"].idxmax()
        yield f"Highest Score: {df['score'].max()} (Band {df.loc[best_row, 'band']})"
        worst_row = df["score"].idxmin()
        yield f"Lowest Score: {df['score'].min()} (Band {df.loc[worst_row, 'band']})"
        yield f"Average Completion Time: {df['time'].mean():.2f} seconds\n"

    for line in _report_lines():
        print(line)


# Report Listening first, then Reading
for section_df, section_name in ((listening_df, "Listening"), (reading_df, "Reading")):
    print_section_stats(section_df, section_name)

Listening Statistics:
--------------------------------------------------
Average Score: 30.50
Average Band: 6.98
Highest Score: 37 (Band 8.5)
Lowest Score: 25 (Band 6.0)
Average Completion Time: 1629.04 seconds

Reading Statistics:
--------------------------------------------------
Average Score: 34.15
Average Band: 7.72
Highest Score: 38 (Band 8.5)
Lowest Score: 28 (Band 6.5)
Average Completion Time: 3210.55 seconds