SPSS Assignment 1: Data Quality and Reliability Analysis¶

Jason O'Brien
CI 7303 — Psychometric Methods
22 March 2026


This assignment walks through importing survey data, evaluating data quality, assessing internal consistency (Cronbach's alpha), and creating scale scores. Python is used in place of SPSS; each procedure mirrors its SPSS counterpart.

In [1]:
import pandas as pd
import numpy as np
import pyreadstat
import matplotlib.pyplot as plt
from IPython.display import display, Markdown

# Load the practice data
df, meta = pyreadstat.read_sav('SPSSAssignment1_Practice_Data.sav')
print(f'Dataset: {df.shape[0]} observations, {df.shape[1]} variables')
df.head()
Dataset: 384 observations, 19 variables
Out[1]:
ID sect16_2 sect17_2_r sect18_2 sect19_2 sect20_2_r sect21_2 sect22_2 sect23_2 sft70_2 sft71_2 sft72_2_r sft73_2 sft74_2 Anxiety InformationProcessing Exam_Score Homework_Score CourseCredit
0 3001.0 4.0 4.0 5.0 4.0 4.0 5.0 4.0 4.0 3.0 2.0 1.0 3.0 3.0 2.888889 2.777778 77.163067 44.218159 1.0
1 3002.0 3.0 3.0 4.0 4.0 3.0 4.0 4.0 4.0 3.0 5.0 5.0 3.0 3.0 1.888889 4.000000 58.389741 72.248523 0.0
2 3003.0 5.0 5.0 4.0 5.0 5.0 5.0 5.0 5.0 3.0 4.0 2.0 3.0 3.0 5.000000 2.777778 84.329808 98.737232 1.0
3 3004.0 5.0 5.0 5.0 4.0 5.0 4.0 5.0 4.0 4.0 2.0 5.0 3.0 4.0 2.888889 3.444444 71.864053 92.913460 1.0
4 3005.0 1.0 1.0 3.0 2.0 3.0 2.0 3.0 4.0 5.0 5.0 4.0 3.0 5.0 1.111111 4.666667 47.051978 NaN 0.0

1. Data Overview¶

The dataset contains two sets of scale items, pre-computed scale scores, and outcome variables:

| Variable Group | Variables | Description |
|---|---|---|
| SECT items | sect16_2 – sect23_2 (8 items) | Scale items; _r = reverse-coded |
| SFT items | sft70_2 – sft74_2 (5 items) | Scale items; _r = reverse-coded |
| Pre-computed scales | Anxiety, InformationProcessing | Already-calculated scale scores |
| Outcomes | Exam_Score, Homework_Score, CourseCredit | Performance measures |

2. Descriptive Statistics (Means, SDs, Frequencies)¶

In [2]:
# Means and standard deviations for all variables
desc = df.describe().T[['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']]
desc = desc.round(3)
display(desc)
count mean std min 25% 50% 75% max
ID 384.0 3192.500 110.995 3001.000 3096.750 3192.500 3288.250 3384.0
sect16_2 384.0 3.542 1.419 1.000 2.000 4.000 5.000 5.0
sect17_2_r 384.0 3.159 1.342 1.000 2.000 3.000 4.000 5.0
sect18_2 384.0 3.607 1.202 1.000 3.000 4.000 5.000 9.0
sect19_2 384.0 3.581 1.217 1.000 3.000 4.000 5.000 5.0
sect20_2_r 384.0 3.232 1.398 1.000 2.000 3.000 5.000 5.0
sect21_2 377.0 3.101 1.331 1.000 2.000 3.000 4.000 5.0
sect22_2 384.0 4.042 1.158 1.000 3.000 4.000 5.000 5.0
sect23_2 384.0 3.888 1.108 1.000 3.000 4.000 5.000 5.0
sft70_2 384.0 3.109 1.236 1.000 2.000 3.000 4.000 5.0
sft71_2 384.0 3.013 1.227 0.000 2.000 3.000 4.000 5.0
sft72_2_r 384.0 3.201 1.278 1.000 2.000 3.000 4.000 5.0
sft73_2 384.0 3.169 1.313 1.000 2.000 3.000 4.000 5.0
sft74_2 371.0 3.668 1.208 1.000 3.000 4.000 5.000 5.0
Anxiety 384.0 2.627 1.082 1.000 1.778 2.611 3.361 5.0
InformationProcessing 384.0 3.346 0.745 1.111 2.778 3.333 3.889 5.0
Exam_Score 354.0 64.500 19.595 11.363 50.649 63.806 78.666 115.0
Homework_Score 379.0 55.774 26.492 0.000 38.324 56.464 73.001 150.0
CourseCredit 384.0 0.451 0.498 0.000 0.000 0.000 1.000 1.0
In [3]:
# Frequency distributions for the scale items
sect_items = ['sect16_2', 'sect17_2_r', 'sect18_2', 'sect19_2', 'sect20_2_r', 'sect21_2', 'sect22_2', 'sect23_2']
sft_items = ['sft70_2', 'sft71_2', 'sft72_2_r', 'sft73_2', 'sft74_2']

print('=== SECT Item Frequencies ===')
for item in sect_items:
    print(f'\n{item}:')
    print(df[item].value_counts().sort_index())

print('\n=== SFT Item Frequencies ===')
for item in sft_items:
    print(f'\n{item}:')
    print(df[item].value_counts().sort_index())
=== SECT Item Frequencies ===

sect16_2:
sect16_2
1.0     56
2.0     41
3.0     55
4.0    103
5.0    129
Name: count, dtype: int64

sect17_2_r:
sect17_2_r
1.0    54
2.0    80
3.0    78
4.0    95
5.0    77
Name: count, dtype: int64

sect18_2:
sect18_2
1.0     26
2.0     33
3.0    117
4.0    102
5.0    105
9.0      1
Name: count, dtype: int64

sect19_2:
sect19_2
1.0     27
2.0     45
3.0    101
4.0    100
5.0    111
Name: count, dtype: int64

sect20_2_r:
sect20_2_r
1.0    59
2.0    64
3.0    89
4.0    73
5.0    99
Name: count, dtype: int64

sect21_2:
sect21_2
1.0    51
2.0    84
3.0    97
4.0    66
5.0    79
Name: count, dtype: int64

sect22_2:
sect22_2
1.0     17
2.0     28
3.0     63
4.0     90
5.0    186
Name: count, dtype: int64

sect23_2:
sect23_2
1.0     20
2.0     15
3.0     94
4.0    114
5.0    141
Name: count, dtype: int64

=== SFT Item Frequencies ===

sft70_2:
sft70_2
1.0     46
2.0     67
3.0    139
4.0     63
5.0     69
Name: count, dtype: int64

sft71_2:
sft71_2
0.0      1
1.0     50
2.0     76
3.0    127
4.0     76
5.0     54
Name: count, dtype: int64

sft72_2_r:
sft72_2_r
1.0     53
2.0     55
3.0    106
4.0    102
5.0     68
Name: count, dtype: int64

sft73_2:
sft73_2
1.0     57
2.0     52
3.0    124
4.0     71
5.0     80
Name: count, dtype: int64

sft74_2:
sft74_2
1.0     21
2.0     41
3.0    103
4.0     81
5.0    125
Name: count, dtype: int64

Interpretation¶

Most item means fall in the 3.0–4.0 range with standard deviations between 1.1 and 1.4, which is reasonable for 5-point Likert-type items. Two items stand out in the descriptive statistics as needing attention:

  • sect18_2 has a maximum of 9.0, which exceeds the 1–5 response scale.
  • sft71_2 has a minimum of 0.0, which falls below the 1–5 response scale.

These are non-plausible values and will be addressed in Section 4. Standard deviations are otherwise consistent across items, suggesting no ceiling or floor effects in the remaining data. The pre-computed Anxiety and InformationProcessing scores show means of 2.63 and 3.35 respectively, both within their expected ranges.

3. Missing Data¶

In [4]:
# Missing data summary
missing = pd.DataFrame({
    'Valid': df.count(),
    'Missing': df.isnull().sum(),
    'Pct_Missing': (df.isnull().sum() / len(df) * 100).round(2)
})
display(missing[missing['Missing'] > 0])
print(f'\nVariables with no missing data: {(missing["Missing"] == 0).sum()} of {len(missing)}')
Valid Missing Pct_Missing
sect21_2 377 7 1.82
sft74_2 371 13 3.39
Exam_Score 354 30 7.81
Homework_Score 379 5 1.30
Variables with no missing data: 15 of 19

Interpretation¶

Four variables have missing data. For the scale items, missingness is minimal: sect21_2 is missing 7 cases (1.8%) and sft74_2 is missing 13 cases (3.4%). Among the outcome variables, Exam_Score has the most missingness at 30 cases (7.8%), and Homework_Score is missing 5 cases (1.3%). The remaining 15 variables have complete data.

Missing data is distributed across different variables rather than concentrated in a single item or a single cluster of respondents, suggesting the missingness is random rather than systematic. At these rates, listwise deletion (the default in most analyses) will not meaningfully reduce statistical power or bias results.

A note on missing data mechanisms. Missing data falls into three categories, and which one applies determines how serious the problem is:

  • MCAR (Missing Completely At Random): Missingness is unrelated to anything in the dataset. A server error drops records, or a respondent skips a question because they sneezed. No bias. This is the easiest case.
  • MAR (Missing At Random): Missingness is related to an observed variable. Sophomores skip an item more often than seniors, but you can see class standing. Manageable with techniques like multiple imputation.
  • MNAR (Missing Not At Random): Missingness is related to the missing value itself. A student with high anxiety skips the anxiety question. This is the dangerous case because the missingness is driven by the unobserved value, and no statistical technique can fully correct for it.

In this dataset, the pattern of missingness across unrelated variables is consistent with MCAR or MAR. There is no indication of MNAR.
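One informal way to probe the MCAR/MAR distinction is to compare an observed variable between cases with and without a missing value on another variable; a large group difference argues against MCAR. The sketch below is illustrative only: the `compare_by_missingness` helper and the synthetic `demo` frame are hypothetical, and the real call would use the loaded `df` (e.g., with `missing_var='Exam_Score'` and `observed_var='Anxiety'`).

```python
import numpy as np
import pandas as pd

def compare_by_missingness(df, missing_var, observed_var):
    """Mean of observed_var for cases missing vs. not missing missing_var."""
    is_missing = df[missing_var].isnull()
    return (df.loc[is_missing, observed_var].mean(),
            df.loc[~is_missing, observed_var].mean())

# Synthetic stand-in data; the assignment would pass the real df instead.
rng = np.random.default_rng(0)
demo = pd.DataFrame({'Exam_Score': rng.normal(65, 20, 100),
                     'Anxiety': rng.normal(2.6, 1.1, 100)})
demo.loc[demo.sample(10, random_state=0).index, 'Exam_Score'] = np.nan

m_missing, m_present = compare_by_missingness(demo, 'Exam_Score', 'Anxiety')
print(round(m_missing, 2), round(m_present, 2))
```

A formal alternative is Little's MCAR test, but for missingness this sparse an eyeball comparison of group means is usually sufficient.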

4. Non-Plausible Values and Outliers¶

In [5]:
# Check for non-plausible values in scale items (expected range: 1-5)
all_items = sect_items + sft_items
print('=== Non-Plausible Values (outside 1-5 range) ===')
for item in all_items:
    out_of_range = df[(df[item] < 1) | (df[item] > 5)][item]
    if len(out_of_range) > 0:
        print(f'\n  {item}: {len(out_of_range)} case(s)')
        print(f'    Values: {out_of_range.values}')
        print(f'    IDs: {df.loc[out_of_range.index, "ID"].values}')
=== Non-Plausible Values (outside 1-5 range) ===

  sect18_2: 1 case(s)
    Values: [9.]
    IDs: [3102.]

  sft71_2: 1 case(s)
    Values: [0.]
    IDs: [3334.]
In [6]:
# Boxplots for scale items
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

df[sect_items].boxplot(ax=axes[0], rot=45)
axes[0].set_title('SECT Items')
axes[0].set_ylabel('Response Value')

df[sft_items].boxplot(ax=axes[1], rot=45)
axes[1].set_title('SFT Items')
axes[1].set_ylabel('Response Value')

plt.tight_layout()
plt.show()
[Figure: side-by-side boxplots of the SECT items and SFT items]
In [7]:
# Boxplots for outcome variables
fig, axes = plt.subplots(1, 3, figsize=(14, 4))

for ax, var in zip(axes, ['Exam_Score', 'Homework_Score', 'CourseCredit']):
    df[[var]].boxplot(ax=ax)
    ax.set_title(var)

plt.tight_layout()
plt.show()
[Figure: boxplots of Exam_Score, Homework_Score, and CourseCredit]

Interpretation¶

Two non-plausible values were identified in the scale items:

  1. sect18_2, ID 3102: value = 9. This exceeds the 1–5 response range and cannot represent a valid response. It is a data entry error.
  2. sft71_2, ID 3334: value = 0. This falls below the 1–5 response range. The lowest valid response is 1.

These are not outliers (extreme but possible values); they are impossible values on the measurement scale. Because no valid interpretation exists for these responses, they are recoded to missing (NaN) rather than imputed. Imputation assumes a true score exists to be estimated — for an impossible value, there is no underlying true score to recover.

The boxplots for the outcome variables show some extreme values (e.g., Homework_Score near 150, Exam_Score above 100). These may represent legitimate performance outcomes (extra credit, bonus points) or errors, but without additional context about the scoring rubric, they are retained and flagged for consideration.
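One way to make "flagged for consideration" concrete is the standard boxplot rule: flag values beyond 1.5 × IQR from the quartiles. This is a sketch, not part of the assignment output, and the `demo` Series is synthetic; flagged values are reviewed, not removed.

```python
import pandas as pd

def iqr_flags(s):
    """Boolean mask: True where a value falls outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]."""
    q1, q3 = s.quantile([0.25, 0.75])
    iqr = q3 - q1
    return (s < q1 - 1.5 * iqr) | (s > q3 + 1.5 * iqr)

# Synthetic scores with one extreme value
demo = pd.Series([50, 55, 60, 62, 65, 70, 150])
print(demo[iqr_flags(demo)].tolist())  # [150]
```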

In [8]:
# Recode non-plausible values to NaN
print('Before cleaning:')
print(f'  sect18_2 max: {df["sect18_2"].max()}')
print(f'  sft71_2 min: {df["sft71_2"].min()}')

# sect18_2: value of 9 is impossible on a 1-5 scale
df.loc[df['sect18_2'] > 5, 'sect18_2'] = np.nan

# sft71_2: value of 0 is impossible on a 1-5 scale
df.loc[df['sft71_2'] < 1, 'sft71_2'] = np.nan

print('\nAfter cleaning:')
print(f'  sect18_2 max: {df["sect18_2"].max()}, missing: {df["sect18_2"].isnull().sum()}')
print(f'  sft71_2 min: {df["sft71_2"].min()}, missing: {df["sft71_2"].isnull().sum()}')
Before cleaning:
  sect18_2 max: 9.0
  sft71_2 min: 0.0

After cleaning:
  sect18_2 max: 5.0, missing: 1
  sft71_2 min: 1.0, missing: 1

5. Reliability Analysis (Cronbach's Alpha)¶

In [9]:
def cronbachs_alpha(items_df):
    """Calculate Cronbach's alpha for a set of items."""
    items = items_df.dropna()
    k = items.shape[1]
    item_vars = items.var(ddof=1)
    total_var = items.sum(axis=1).var(ddof=1)
    alpha = (k / (k - 1)) * (1 - item_vars.sum() / total_var)
    return alpha

def alpha_if_deleted(items_df):
    """Calculate alpha-if-item-deleted and corrected item-total correlations."""
    items = items_df.dropna()
    results = []
    total = items.sum(axis=1)
    
    for col in items.columns:
        # Alpha if this item is deleted
        remaining = items.drop(columns=[col])
        a = cronbachs_alpha(remaining)
        
        # Corrected item-total correlation (correlation with total minus this item)
        corrected_total = total - items[col]
        r = items[col].corr(corrected_total)
        
        results.append({'Item': col, 'Alpha_if_Deleted': round(a, 4), 'Corrected_Item_Total_r': round(r, 4)})
    
    return pd.DataFrame(results)
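As a sanity check on the formula implemented above (alpha = k/(k-1) * (1 - sum of item variances / variance of the total)), three perfectly correlated items should yield alpha = 1.0 exactly. A minimal, self-contained verification:

```python
import pandas as pd

def cronbachs_alpha(items_df):
    """Standard alpha: (k/(k-1)) * (1 - sum of item variances / variance of total)."""
    items = items_df.dropna()
    k = items.shape[1]
    item_vars = items.var(ddof=1)
    total_var = items.sum(axis=1).var(ddof=1)
    return (k / (k - 1)) * (1 - item_vars.sum() / total_var)

# Three identical items: item variances sum to 7.5, total variance is 22.5,
# so alpha = (3/2) * (1 - 7.5/22.5) = 1.0.
perfect = pd.DataFrame({'a': [1, 2, 3, 4, 5],
                        'b': [1, 2, 3, 4, 5],
                        'c': [1, 2, 3, 4, 5]})
print(round(cronbachs_alpha(perfect), 3))  # 1.0
```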
In [10]:
# SECT scale reliability (after cleaning non-plausible values)
sect_alpha = cronbachs_alpha(df[sect_items])
print(f'SECT Scale — Cronbach\'s Alpha: {sect_alpha:.4f}')
print(f'Number of items: {len(sect_items)}')
print()

sect_aid = alpha_if_deleted(df[sect_items])
display(sect_aid)
SECT Scale — Cronbach's Alpha: 0.8564
Number of items: 8

Item Alpha_if_Deleted Corrected_Item_Total_r
0 sect16_2 0.8422 0.5807
1 sect17_2_r 0.8358 0.6276
2 sect18_2 0.8344 0.6457
3 sect19_2 0.8411 0.5828
4 sect20_2_r 0.8399 0.5965
5 sect21_2 0.8406 0.5884
6 sect22_2 0.8393 0.6016
7 sect23_2 0.8403 0.5951
In [11]:
# SFT scale reliability (after cleaning non-plausible values)
sft_alpha = cronbachs_alpha(df[sft_items])
print(f'SFT Scale — Cronbach\'s Alpha: {sft_alpha:.4f}')
print(f'Number of items: {len(sft_items)}')
print()

sft_aid = alpha_if_deleted(df[sft_items])
display(sft_aid)
SFT Scale — Cronbach's Alpha: 0.6626
Number of items: 5

Item Alpha_if_Deleted Corrected_Item_Total_r
0 sft70_2 0.6082 0.4215
1 sft71_2 0.6056 0.4274
2 sft72_2_r 0.6247 0.3860
3 sft73_2 0.6103 0.4179
4 sft74_2 0.6065 0.4259

6. Interpretation: Alpha if Item Deleted¶

SECT Scale (α = .856, 8 items) — Good reliability.

All alpha-if-item-deleted values fall below the overall alpha of .856, meaning no individual item weakens the scale. If removing any item caused alpha to increase, that would signal the item is inconsistent with the rest; here the opposite holds, and every item contributes to internal consistency. Corrected item-total correlations range from .58 to .65, indicating each item correlates moderately to strongly with the composite of the remaining items. This is a well-functioning scale.

SFT Scale (α = .663, 5 items) — Below the conventional .70 threshold.

No single item is responsible for the low alpha. All alpha-if-item-deleted values are below .663 (the lowest is .606 for sft70_2), so removing any item would make reliability worse, not better. Corrected item-total correlations (.39–.43) are modest but consistent.

The most likely explanation for the low alpha is the small number of items. Alpha is a function of both inter-item correlation and item count; with only 5 items, even moderate inter-item correlations produce a lower alpha than the same correlations would in an 8-item scale. The SFT items correlate with each other at roughly the same level as the SECT items, but the shorter scale length pulls the coefficient down. This is a known limitation of alpha as a reliability estimate for short scales.
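The scale-length argument can be quantified with the Spearman-Brown prophecy formula, which projects reliability if the scale were lengthened by a factor k, under the assumption that any added items are parallel to the existing ones (an assumption, not a guarantee):

```python
def spearman_brown(alpha, k):
    """Projected reliability after lengthening a scale by factor k,
    assuming added items are parallel to the originals."""
    return k * alpha / (1 + (k - 1) * alpha)

# If the 5-item SFT scale (alpha = .663) were lengthened to 8 items (k = 8/5):
projected = spearman_brown(0.663, 8 / 5)
print(f'{projected:.3f}')  # 0.759
```

Lengthening the SFT scale to 8 parallel items projects an alpha of about .76, consistent with the claim that item count, not item quality, is the limiting factor.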

An alternative possibility is that the 5 SFT items are tapping slightly different facets of the construct, which would reduce internal consistency. Further analysis (e.g., factor analysis) would be needed to evaluate this.

7. Create Scale Scores¶

In [12]:
# Mean scale scores (recommended approach)
df['SECT_Mean'] = df[sect_items].mean(axis=1)
df['SFT_Mean'] = df[sft_items].mean(axis=1)

print('=== New Scale Score Descriptives ===')
display(df[['SECT_Mean', 'SFT_Mean']].describe().round(3))
=== New Scale Score Descriptives ===
SECT_Mean SFT_Mean
count 384.000 384.000
mean 3.518 3.229
std 0.900 0.819
min 1.125 1.400
25% 2.875 2.600
50% 3.625 3.200
75% 4.250 3.800
max 5.000 5.000
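A caveat on mean scoring: pandas' `mean(axis=1)` averages whatever items are present, so a respondent who answered only one item still receives a "full" scale score (which is why the scale scores above show zero missing). SPSS's MEAN.k convention instead requires at least k valid items before computing the mean. A sketch of that rule follows; the `mean_score` helper and `demo` data are illustrative, not part of the assignment.

```python
import numpy as np
import pandas as pd

def mean_score(items_df, min_valid):
    """Row mean, set to NaN for rows with fewer than min_valid answered items
    (analogous to SPSS's MEAN.k convention)."""
    score = items_df.mean(axis=1)
    score[items_df.notna().sum(axis=1) < min_valid] = np.nan
    return score

demo = pd.DataFrame({'i1': [4, np.nan, 5],
                     'i2': [2, np.nan, np.nan],
                     'i3': [3, 1, np.nan]})
print(mean_score(demo, min_valid=2).tolist())  # [3.0, nan, nan]
```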

8. Final Verification¶

In [13]:
# Check scale scores for missing data and outliers
print('=== Scale Score Missing Data ===')
for scale in ['SECT_Mean', 'SFT_Mean']:
    n_missing = df[scale].isnull().sum()
    print(f'{scale}: {n_missing} missing ({n_missing/len(df)*100:.1f}%)')

print()

# Boxplots of final scale scores
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
df[['SECT_Mean']].boxplot(ax=axes[0])
axes[0].set_title('SECT Mean Score')
df[['SFT_Mean']].boxplot(ax=axes[1])
axes[1].set_title('SFT Mean Score')
plt.tight_layout()
plt.show()
=== Scale Score Missing Data ===
SECT_Mean: 0 missing (0.0%)
SFT_Mean: 0 missing (0.0%)

[Figure: boxplots of the SECT_Mean and SFT_Mean scale scores]

Summary and Interpretation¶

Data Quality¶

The dataset contained 384 observations across 19 variables. Two non-plausible values were identified in the scale items: a response of 9 on sect18_2 (ID 3102) and a response of 0 on sft71_2 (ID 3334). Both fall outside the valid 1–5 response range and were recoded to missing as data entry errors. Missing data across the dataset was minimal (1.3%–7.8% on affected variables) and distributed across different variables, consistent with data missing at random.

Reliability¶

The SECT scale (8 items) demonstrated good internal consistency (α = .856). All items contributed positively to scale reliability, with corrected item-total correlations between .58 and .65. No items were flagged for removal.

The SFT scale (5 items) fell below the conventional .70 threshold (α = .663). However, no individual item was responsible for the low reliability — all alpha-if-item-deleted values were lower than the overall alpha. The most plausible explanation is the small number of items rather than poor item quality. The corrected item-total correlations (.39–.43) are consistent but modest, and the scale would likely benefit from additional items to improve reliability.

Scale Scores¶

Mean scale scores were computed for both scales. The SECT mean score (M = 3.52, SD = 0.90) and SFT mean score (M = 3.23, SD = 0.82) both fall near the midpoint of the 1–5 range, indicating adequate variability in the sample. Neither scale score showed evidence of floor or ceiling effects.

Deliverables¶

| Deliverable | Status |
|---|---|
| Descriptive statistics table | Section 2 |
| Reliability output (Cronbach's alpha) | Section 5 |
| Interpretation of weak items | Section 6 |
| Final scale scores | Sections 7–8 |

AI Use Disclosure¶

This assignment was completed using Claude (Anthropic) as a collaborative tool through the Claude Code CLI. The AI assisted with:

  • Code execution: Python was used in place of SPSS. Claude wrote the analysis code (pyreadstat for data import, pandas for descriptives, custom functions for Cronbach's alpha and alpha-if-item-deleted).
  • Interpretation drafting: Initial interpretations were drafted by Claude and then reviewed, discussed, and revised through a Socratic dialogue process.
  • Concept reinforcement: Key concepts (Cronbach's alpha, non-plausible values vs. outliers, missing data patterns, mean vs. sum scoring) were discussed before interpretations were finalized.

What I did: Directed the analysis, identified what needed to be done at each step, explained my understanding of each concept before Claude drafted interpretations, caught and corrected conceptual gaps, and made final editorial decisions. I reviewed all output and confirmed that interpretations matched my understanding of the material.

What Claude did: Wrote the Python code, generated statistical output, drafted interpretation text based on our discussions, and connected psychometric concepts to my professional context.

Process documentation: A detailed experiment log documenting every interaction, concept discussion, and decision point is maintained at for_jason/coursework-ai-experiment-log.md and version-controlled via git. This log records my initial understanding of each concept, where corrections were needed, and what was reinforced — serving as evidence that the learning process, not just the output, was the focus of this work.