rows, cols = df.shape
mem_bytes = int(df.memory_usage(deep=True).sum())
mem_mb = round(mem_bytes / (1024*1024), 2)
| Column | Flags | Type | Unique Values | Unique Ratio | Missing (%) |
|---|---|---|---|---|---|
| odometer | id-like | float64 | 10700 | 0.957323 | 0.0 |
| vin | id-like | str | 11175 | 0.999821 | 0.0 |
# Build Integrity Notes lists and full flagged table
flag_rows = []
for c in df.columns:
f = []
if c in id_like: f.append('id-like')
if c in hi_card: f.append('high-card')
if c in consts: f.append('constant')
if f:
flag_rows.append({
'Column': c,
'Flags': ', '.join(f),
'Type': dtypes[c],
'Unique Values': int(uniques.get(c,0)),
'Unique Ratio': float(uniq_ratio.get(c,0)),
'Missing (%)': float(miss_pct.get(c,0))
})
flagged_df = pd.DataFrame(flag_rows)
flagged_df
| year | make | model | trim | body | transmission | vin | state | condition | odometer | color | interior | seller | mmr | sellingprice | saledate |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2010 | Chrysler | PT Cruiser | Classic | Wagon | automatic | 3a4gy5f97at172213 | tn | 26 | 67969 | gray | gray | tdaf remarketing | 5225 | 5500 | Wed Jun 03 2015 03:30:00 GMT-0700 (PDT) |
| 2008 | Nissan | Altima | 3.5 SE | Sedan | automatic | 1n4bl21ex8c168024 | pa | 2 | 126302 | gray | black | valley national bank | 5625 | 3800 | Tue May 26 2015 02:30:00 GMT-0700 (PDT) |
| 2012 | Jeep | Liberty | Sport | SUV | automatic | 1c4pjlak6cw100859 | tx | 2 | 68499 | green | black | north america financial | 10300 | 7900 | Tue Jan 20 2015 02:30:00 GMT-0800 (PST) |
| 2009 | Jeep | Wrangler | Unlimited X | suv | automatic | 1j4ga391x9l706276 | fl | 36 | 63592 | yellow | gray | tdaf remarketing | 17800 | 19400 | Mon Jun 08 2015 06:00:00 GMT-0700 (PDT) |
| 2013 | Nissan | Juke | S | Hatchback | automatic | jn8af5mrxdt213006 | tx | 39 | 33185 | white | black | nissan motor acceptance corporation | 13800 | 13000 | Wed May 27 2015 03:30:00 GMT-0700 (PDT) |
| 2013 | BMW | 5 Series | 528i | Sedan | automatic | wbaxg5c51ddy30612 | tn | 42 | 18011 | white | gray | harold mathews nissan inc | 43200 | 32000 | Wed Jun 24 2015 03:00:00 GMT-0700 (PDT) |
| 2008 | Audi | A4 | 2.0T | Sedan | automatic | wauaf78e48a080112 | nc | 37 | 103193 | blue | black | hendrick toyota scion concord | 8100 | 8100 | Wed Feb 25 2015 06:00:00 GMT-0800 (PST) |
| 2009 | Mercedes-Benz | C-Class | C300 Luxury | sedan | automatic | wddgf54x29r044131 | tx | 41 | 99506 | gray | beige | mercedes-benz of houston north | 12300 | 10200 | Thu May 28 2015 03:30:00 GMT-0700 (PDT) |
df[['year', 'make', 'model', 'trim', 'body', 'transmission', 'vin', 'state', 'condition', 'odometer', 'color', 'interior', 'seller', 'mmr', 'sellingprice', 'saledate']].head(8)
| year | make | model | trim | body | transmission | vin | state | condition | odometer | color | interior | seller | mmr | sellingprice | saledate |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2002 | Honda | Accord | EX | Coupe | automatic | 1hgcg32772a025786 | nc | 24.0 | 107170.0 | black | gray | princeton bmw | 2025.0 | 2300.0 | Thu May 28 2015 07:00:00 GMT-0700 (PDT) |
| 2013 | Toyota | Camry | LE | sedan | automatic | 4t4bf1fk3dr285962 | az | 29.0 | 55079.0 | gray | gray | wells fargo dealer services | 11400.0 | 12300.0 | Thu Jun 11 2015 05:00:00 GMT-0700 (PDT) |
| 2009 | Lexus | ES 350 | Base | Sedan | automatic | jthbj46g392302416 | pa | 4.0 | 79514.0 | red | black | car sense inc | 13300.0 | 14150.0 | Tue Jan 13 2015 09:30:00 GMT-0800 (PST) |
| 2012 | Infiniti | G Sedan | G37 Journey | G Sedan | automatic | jn1cv6ap5cm628584 | ca | 47.0 | 13601.0 | blue | black | nissan infiniti lt | 22700.0 | 23750.0 | Thu Jan 15 2015 04:30:00 GMT-0800 (PST) |
| 2011 | Infiniti | G Sedan | G37x | G Sedan | automatic | jn1cv6ar8bm355821 | tn | 36.0 | 31282.0 | white | gray | nissan infiniti lt | 18500.0 | 18800.0 | Wed Feb 18 2015 02:30:00 GMT-0800 (PST) |
| 2014 | Kia | Optima | LX | Sedan | automatic | 5xxgm4a70eg348893 | oh | 47.0 | 16250.0 | black | gray | kia motors america inc | 15450.0 | 14400.0 | Tue Feb 17 2015 01:30:00 GMT-0800 (PST) |
| 2014 | Hyundai | Tucson | GLS | SUV | NaN | km8jt3af7eu818627 | pa | 26.0 | 18454.0 | black | brown | hyundai motor finance | 15400.0 | 14300.0 | Fri Feb 13 2015 01:00:00 GMT-0800 (PST) |
| 2014 | Chevrolet | Captiva Sport | LT Fleet | SUV | automatic | 3gnal3ek9es638262 | il | 3.0 | 13517.0 | black | gray | gm remarketing | 16850.0 | 16200.0 | Tue Jan 13 2015 10:00:00 GMT-0800 (PST) |
n = len(df)
start_index = max(0, floor(n / 2) - 4)
df.iloc[start_index : start_index + 8][['year', 'make', 'model', 'trim', 'body', 'transmission', 'vin', 'state', 'condition', 'odometer', 'color', 'interior', 'seller', 'mmr', 'sellingprice', 'saledate']]
| year | make | model | trim | body | transmission | vin | state | condition | odometer | color | interior | seller | mmr | sellingprice | saledate |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2012 | Infiniti | EX | EX35 | SUV | automatic | jn1aj0hp5cm400025 | ca | 26 | 33028 | black | black | nissan infiniti lt | 23800 | 20500 | Tue May 19 2015 05:30:00 GMT-0700 (PDT) |
| 2013 | Hyundai | Sonata | SE | Sedan | automatic | 5npec4ab6dh546069 | fl | 4 | 92921 | blue | black | union leasing exchange llc | 10750 | 10500 | Wed Jun 17 2015 04:15:00 GMT-0700 (PDT) |
| 2012 | Ford | Explorer | XLT | SUV | automatic | 1fmhk7d83cga24404 | wi | 41 | 77092 | black | beige | sutton ford/ge | 17850 | 16500 | Wed Jan 07 2015 10:00:00 GMT-0800 (PST) |
| 2012 | Nissan | Altima | 2.5 S | Sedan | automatic | 1n4al2ap9cc111819 | ga | 41 | 33829 | white | black | nissan-infiniti lt | 12150 | 11400 | Tue Jan 13 2015 09:30:00 GMT-0800 (PST) |
| 2008 | Subaru | Outback | 2.5 XT Limited | Wagon | manual | 4s4bp63c886349280 | nj | 1 | 96230 | gray | black | a & a quality services inc | 10050 | 8100 | Wed Feb 11 2015 04:00:00 GMT-0800 (PST) |
| 2013 | Toyota | Corolla | LE | Sedan | automatic | 5yfbu4ee3dp152018 | fl | 48 | 17983 | white | gray | world omni financial corporation | 12550 | 13400 | Tue Jan 27 2015 01:30:00 GMT-0800 (PST) |
| 2014 | Ford | Fusion | SE | Sedan | automatic | 3fa6p0h77er138056 | tn | 27 | 33826 | white | — | ars/avis budget group | 15100 | 14000 | Wed Jan 28 2015 02:00:00 GMT-0800 (PST) |
| 2013 | Ford | F-150 | XLT | supercrew | automatic | 1ftfw1ef8dke54566 | tx | 29 | 105715 | white | gray | pm standley motorcars | 20300 | 19500 | Wed Jun 03 2015 03:20:00 GMT-0700 (PDT) |
df[['year', 'make', 'model', 'trim', 'body', 'transmission', 'vin', 'state', 'condition', 'odometer', 'color', 'interior', 'seller', 'mmr', 'sellingprice', 'saledate']].tail(8)
| year | condition | odometer | mmr | sellingprice |
|---|---|---|---|---|
| 11177.000000 | 10957.000000 | 11176.000000 | 11177.000000 | 11177.000000 |
| 2010.005816 | 30.642329 | 69088.261095 | 13650.476425 | 13501.447079 |
| 3.988358 | 13.316668 | 53556.318592 | 9516.423115 | 9557.044552 |
| 1991.000000 | 1.000000 | 1.000000 | 25.000000 | 200.000000 |
| 2007.000000 | 23.000000 | 28394.250000 | 7025.000000 | 6900.000000 |
| 2012.000000 | 34.000000 | 53031.000000 | 12150.000000 | 12100.000000 |
| 2013.000000 | 41.000000 | 99378.750000 | 18150.000000 | 18000.000000 |
| 2015.000000 | 49.000000 | 999999.000000 | 102000.000000 | 103000.000000 |
df[['year', 'make', 'model', 'trim', 'body', 'transmission', 'vin', 'state', 'condition', 'odometer', 'color', 'interior', 'seller', 'mmr', 'sellingprice', 'saledate']].describe()
| Column | Type | Non-Null Count | Missing (%) | Unique Values | Sample Values | Flags |
|---|---|---|---|---|---|---|
| year | int64 | 11177 | 0.0 | 25 | 2010, 2008, 2012 | |
| make | str | 10968 | 1.9 | 64 | Chrysler, Nissan, Jeep | |
| model | str | 10968 | 1.9 | 561 | PT Cruiser, Altima, Liberty | |
| trim | str | 10962 | 1.9 | 736 | Classic, 3.5 SE, Sport | |
| body | str | 10912 | 2.4 | 54 | Wagon, Sedan, SUV | |
| transmission | str | 9904 | 11.4 | 2 | automatic, manual | |
| vin | str | 11177 | 0.0 | 11175 | 3a4gy5f97at172213, 1n4bl21ex8c168024, 1c4pjlak6cw100859 | id-like |
| state | str | 11177 | 0.0 | 35 | tn, pa, tx | |
| condition | float64 | 10957 | 2.0 | 40 | 26.0, 2.0, 36.0 | |
| odometer | float64 | 11176 | 0.0 | 10700 | 67969.0, 126302.0, 68499.0 | id-like |
| color | str | 11160 | 0.2 | 18 | gray, green, yellow | |
| interior | str | 11160 | 0.2 | 17 | gray, black, beige | |
| seller | str | 11177 | 0.0 | 2423 | tdaf remarketing, valley national bank, north america financial | |
| mmr | float64 | 11177 | 0.0 | 908 | 5225.0, 5625.0, 10300.0 | |
| sellingprice | float64 | 11177 | 0.0 | 653 | 5500.0, 3800.0, 7900.0 | |
| saledate | str | 11177 | 0.0 | 1480 | Wed Jun 03 2015 03:30:00 GMT-0700 (PDT), Tue May 26 2015 02:30:00 GMT-0700 (PDT), Tue Jan 20 2015 02:30:00 GMT-0800 (PST) |
dtypes = df.dtypes.astype(str)
nonnull = df.notnull().sum()
miss_pct = (df.isnull().mean()*100).round(1)
uniques = df.nunique(dropna=True)
n = max(len(df), 1)
uniq_ratio = (uniques / n).fillna(0.0)
def _sample_vals(s, k=3):
vals = pd.unique(s.dropna().astype(str))[:k]
return ', '.join(map(str, vals)) if len(vals) else ''
flags_col = []
for c in df.columns:
flags=[]
if uniques.get(c,0) <= 1: flags.append('constant')
if uniq_ratio.get(c,0) >= 0.95 and 'datetime' not in dtypes[c].lower(): flags.append('id-like')
if dtypes[c].startswith('object') and uniq_ratio.get(c,0) > 0.5 and 'id-like' not in flags: flags.append('high-card')
flags_col.append(', '.join(flags))
profile_df = pd.DataFrame({
'Column': df.columns,
'Type': dtypes.values,
'Non-Null Count': nonnull.values,
'Missing (%)': miss_pct.values,
'Unique Values': uniques.values,
'Sample Values': [ _sample_vals(df[c]) for c in df.columns ],
'Flags': flags_col
})
profile_df
| Column | Type | Non-Null Count | Unique Values |
|---|---|---|---|
| year | int64 | 11177 | 25 |
| make | str | 10968 | 64 |
| model | str | 10968 | 561 |
| trim | str | 10962 | 736 |
| body | str | 10912 | 54 |
| transmission | str | 9904 | 2 |
| vin | str | 11177 | 11175 |
| state | str | 11177 | 35 |
| condition | float64 | 10957 | 40 |
| odometer | float64 | 11176 | 10700 |
| color | str | 11160 | 18 |
| interior | str | 11160 | 17 |
| seller | str | 11177 | 2423 |
| mmr | float64 | 11177 | 908 |
| sellingprice | float64 | 11177 | 653 |
| saledate | str | 11177 | 1480 |
pd.DataFrame({
'Column': df.columns,
'Type': df.dtypes.astype(str).values,
'Non-Null Count': df.notnull().sum().values,
'Unique Values': df.nunique().values
})
| year | make | model | trim | body | transmission | vin | state | condition | odometer | color | interior | seller | mmr | sellingprice | saledate |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2015 | Mercedes-Benz | S-Class | S550 | Sedan | automatic | wddug8cb9fa069559 | ca | 44.0 | 9218.0 | white | black | fast cars | 95000.0 | 94750.0 | Thu May 21 2015 05:30:00 GMT-0700 (PDT) |
| year | make | model | trim | body | transmission | vin | state | condition | odometer | color | interior | seller | mmr | sellingprice | saledate |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2014 | Land Rover | Range Rover | HSE | SUV | automatic | salgs2vf1ea126170 | ca | 48.0 | 33995.0 | black | black | harrison imports llc | 82000.0 | 82250.0 | Thu Feb 12 2015 04:30:00 GMT-0800 (PST) |
| year | make | model | trim | body | transmission | vin | state | condition | odometer | color | interior | seller | mmr | sellingprice | saledate |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2013 | Porsche | 911 | Carrera 4S | Coupe | automatic | wp0ab2a91ds121816 | fl | 39.0 | 22476.0 | gray | black | porsche financial services | 82000.0 | 79000.0 | Tue Jun 02 2015 05:00:00 GMT-0700 (PDT) |
num_cols = df.select_dtypes(include='number').columns.tolist()[:max_cols]
df_num = df[num_cols]
df_num = df_num.sample(max_rows, random_state=0) if len(df_num) > max_rows else df_num
med = df_num.median(); mad = (df_num - med).abs().median()
rz = 0.6745 * (df_num - med) / mad.replace(0, np.nan)
row_score = rz.abs().max(axis=1)
top3 = row_score.nlargest(3)
dfn = df.select_dtypes(include='number').iloc[:, :max_cols]
dfn = dfn.sample(max_rows, random_state=0) if len(dfn) > max_rows else dfn
# rank columns by Tukey outliers (1.5*IQR) and plot violins with inner box
| Column | Missing Values | Missing (%) |
|---|---|---|
| make | 209 | 1.9 |
| model | 209 | 1.9 |
| trim | 215 | 1.9 |
| body | 265 | 2.4 |
| transmission | 1273 | 11.4 |
| condition | 220 | 2.0 |
| odometer | 1 | 0.0 |
| color | 17 | 0.2 |
| interior | 17 | 0.2 |
nulls = df.isnull().sum()
nulls_pct = (df.isnull().mean() * 100).round(1)
missing_df = pd.DataFrame({
'Column': df.columns,
'Missing Values': nulls.values,
'Missing (%)': nulls_pct.values
})\nmissing_df[missing_df['Missing Values'] > 0]
nulls = df.isnull().sum();
nulls_pct = (
df.isnull().mean()*100
).round(1)
missing_df = pd.DataFrame({
'Column': df.columns,
'Missing Values': nulls.values,
'Missing (%)': nulls_pct.values
})
top_miss = (
missing_df[missing_df['Missing Values'] > 0]
.sort_values('Missing (%)', ascending=False)
.loc[:, ['Column', 'Missing (%)']]
.head(20)
.reset_index(drop=True)
)
top_miss
dist_col = '<chosen categorical>'
s = df[dist_col].astype('object').where(~df[dist_col].isna(), other='Missing')
vc = s.value_counts(dropna=False)
top_k = 8 # Top-8 + Other (+ Missing)