import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, RobustScaler, PolynomialFeatures
from sklearn.datasets import fetch_openml
boston = fetch_openml(name="boston", version=1, as_frame=True)
df = boston.data
df['Price'] = boston.target
df[:10]
 | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | Price |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.00632 | 18.0 | 2.31 | 0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1 | 296.0 | 15.3 | 396.90 | 4.98 | 24.0 |
1 | 0.02731 | 0.0 | 7.07 | 0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2 | 242.0 | 17.8 | 396.90 | 9.14 | 21.6 |
2 | 0.02729 | 0.0 | 7.07 | 0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2 | 242.0 | 17.8 | 392.83 | 4.03 | 34.7 |
3 | 0.03237 | 0.0 | 2.18 | 0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3 | 222.0 | 18.7 | 394.63 | 2.94 | 33.4 |
4 | 0.06905 | 0.0 | 2.18 | 0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3 | 222.0 | 18.7 | 396.90 | 5.33 | 36.2 |
5 | 0.02985 | 0.0 | 2.18 | 0 | 0.458 | 6.430 | 58.7 | 6.0622 | 3 | 222.0 | 18.7 | 394.12 | 5.21 | 28.7 |
6 | 0.08829 | 12.5 | 7.87 | 0 | 0.524 | 6.012 | 66.6 | 5.5605 | 5 | 311.0 | 15.2 | 395.60 | 12.43 | 22.9 |
7 | 0.14455 | 12.5 | 7.87 | 0 | 0.524 | 6.172 | 96.1 | 5.9505 | 5 | 311.0 | 15.2 | 396.90 | 19.15 | 27.1 |
8 | 0.21124 | 12.5 | 7.87 | 0 | 0.524 | 5.631 | 100.0 | 6.0821 | 5 | 311.0 | 15.2 | 386.63 | 29.93 | 16.5 |
9 | 0.17004 | 12.5 | 7.87 | 0 | 0.524 | 6.004 | 85.9 | 6.5921 | 5 | 311.0 | 15.2 | 386.71 | 17.10 | 18.9 |
print(boston.DESCR)
**Author**:
**Source**: Unknown - Date unknown
**Please cite**: The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol. 5, 81-102, 1978. Used in Belsley, Kuh & Welsch, 'Regression diagnostics ...', Wiley, 1980. N.B. Various transformations are used in the table on pages 244-261 of the latter.

Variables in order:
- CRIM: per capita crime rate by town
- ZN: proportion of residential land zoned for lots over 25,000 sq.ft.
- INDUS: proportion of non-retail business acres per town
- CHAS: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
- NOX: nitric oxides concentration (parts per 10 million)
- RM: average number of rooms per dwelling
- AGE: proportion of owner-occupied units built prior to 1940
- DIS: weighted distances to five Boston employment centres
- RAD: index of accessibility to radial highways
- TAX: full-value property-tax rate per $10,000
- PTRATIO: pupil-teacher ratio by town
- B: 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
- LSTAT: % lower status of the population
- MEDV: Median value of owner-occupied homes in $1000's

Information about the dataset: CLASSTYPE: numeric, CLASSINDEX: last. Downloaded from openml.org.
# %pip install scikit-learn
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    category
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    category
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  Price    506 non-null    float64
dtypes: category(2), float64(12)
memory usage: 49.0 KB
df.shape
(506, 14)
feature_names = df.columns.tolist()
print(feature_names)
['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'Price']
print(df.dtypes)
CRIM        float64
ZN          float64
INDUS       float64
CHAS       category
NOX         float64
RM          float64
AGE         float64
DIS         float64
RAD        category
TAX         float64
PTRATIO     float64
B           float64
LSTAT       float64
Price       float64
dtype: object
df.isnull().sum()
CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
Price      0
dtype: int64
df.corr()
 | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | Price |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
CRIM | 1.000000 | -0.200469 | 0.406583 | -0.055892 | 0.420972 | -0.219247 | 0.352734 | -0.379670 | 0.625505 | 0.582764 | 0.289946 | -0.385064 | 0.455621 | -0.388305 |
ZN | -0.200469 | 1.000000 | -0.533828 | -0.042697 | -0.516604 | 0.311991 | -0.569537 | 0.664408 | -0.311948 | -0.314563 | -0.391679 | 0.175520 | -0.412995 | 0.360445 |
INDUS | 0.406583 | -0.533828 | 1.000000 | 0.062938 | 0.763651 | -0.391676 | 0.644779 | -0.708027 | 0.595129 | 0.720760 | 0.383248 | -0.356977 | 0.603800 | -0.483725 |
CHAS | -0.055892 | -0.042697 | 0.062938 | 1.000000 | 0.091203 | 0.091251 | 0.086518 | -0.099176 | -0.007368 | -0.035587 | -0.121515 | 0.048788 | -0.053929 | 0.175260 |
NOX | 0.420972 | -0.516604 | 0.763651 | 0.091203 | 1.000000 | -0.302188 | 0.731470 | -0.769230 | 0.611441 | 0.668023 | 0.188933 | -0.380051 | 0.590879 | -0.427321 |
RM | -0.219247 | 0.311991 | -0.391676 | 0.091251 | -0.302188 | 1.000000 | -0.240265 | 0.205246 | -0.209847 | -0.292048 | -0.355501 | 0.128069 | -0.613808 | 0.695360 |
AGE | 0.352734 | -0.569537 | 0.644779 | 0.086518 | 0.731470 | -0.240265 | 1.000000 | -0.747881 | 0.456022 | 0.506456 | 0.261515 | -0.273534 | 0.602339 | -0.376955 |
DIS | -0.379670 | 0.664408 | -0.708027 | -0.099176 | -0.769230 | 0.205246 | -0.747881 | 1.000000 | -0.494588 | -0.534432 | -0.232471 | 0.291512 | -0.496996 | 0.249929 |
RAD | 0.625505 | -0.311948 | 0.595129 | -0.007368 | 0.611441 | -0.209847 | 0.456022 | -0.494588 | 1.000000 | 0.910228 | 0.464741 | -0.444413 | 0.488676 | -0.381626 |
TAX | 0.582764 | -0.314563 | 0.720760 | -0.035587 | 0.668023 | -0.292048 | 0.506456 | -0.534432 | 0.910228 | 1.000000 | 0.460853 | -0.441808 | 0.543993 | -0.468536 |
PTRATIO | 0.289946 | -0.391679 | 0.383248 | -0.121515 | 0.188933 | -0.355501 | 0.261515 | -0.232471 | 0.464741 | 0.460853 | 1.000000 | -0.177383 | 0.374044 | -0.507787 |
B | -0.385064 | 0.175520 | -0.356977 | 0.048788 | -0.380051 | 0.128069 | -0.273534 | 0.291512 | -0.444413 | -0.441808 | -0.177383 | 1.000000 | -0.366087 | 0.333461 |
LSTAT | 0.455621 | -0.412995 | 0.603800 | -0.053929 | 0.590879 | -0.613808 | 0.602339 | -0.496996 | 0.488676 | 0.543993 | 0.374044 | -0.366087 | 1.000000 | -0.737663 |
Price | -0.388305 | 0.360445 | -0.483725 | 0.175260 | -0.427321 | 0.695360 | -0.376955 | 0.249929 | -0.381626 | -0.468536 | -0.507787 | 0.333461 | -0.737663 | 1.000000 |
plt.figure(figsize=(12, 8))
sns.heatmap(df.corr(), annot=True, cmap="Greys", fmt=".2f", linewidths=0.5)
plt.title("Feature Correlation")
plt.show()
df.duplicated().sum()
0
plt.figure(figsize=(8, 6))
sns.histplot(df['Price'], bins=30, kde=True, color='blue')
plt.title('Distribution of House Prices')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()
plt.figure(figsize=(8, 6))
sns.scatterplot(x=df['RM'], y=df['Price'], color='blue')
plt.title('Price vs. Number of Rooms')
plt.xlabel('RM')
plt.ylabel('Price')
plt.show()
plt.figure(figsize=(8, 6))
sns.boxplot(x=df['CHAS'], y=df['Price'], palette='coolwarm')
plt.title('Prices Based on CHAS')
plt.xlabel('CHAS (1 = Near River, 0 = Not Near River)')
plt.ylabel('Price')
plt.show()
df.describe()
 | CRIM | ZN | INDUS | NOX | RM | AGE | DIS | TAX | PTRATIO | B | LSTAT | Price |
---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 |
mean | 3.613524 | 11.363636 | 11.136779 | 0.554695 | 6.284634 | 68.574901 | 3.795043 | 408.237154 | 18.455534 | 356.674032 | 12.653063 | 22.532806 |
std | 8.601545 | 23.322453 | 6.860353 | 0.115878 | 0.702617 | 28.148861 | 2.105710 | 168.537116 | 2.164946 | 91.294864 | 7.141062 | 9.197104 |
min | 0.006320 | 0.000000 | 0.460000 | 0.385000 | 3.561000 | 2.900000 | 1.129600 | 187.000000 | 12.600000 | 0.320000 | 1.730000 | 5.000000 |
25% | 0.082045 | 0.000000 | 5.190000 | 0.449000 | 5.885500 | 45.025000 | 2.100175 | 279.000000 | 17.400000 | 375.377500 | 6.950000 | 17.025000 |
50% | 0.256510 | 0.000000 | 9.690000 | 0.538000 | 6.208500 | 77.500000 | 3.207450 | 330.000000 | 19.050000 | 391.440000 | 11.360000 | 21.200000 |
75% | 3.677083 | 12.500000 | 18.100000 | 0.624000 | 6.623500 | 94.075000 | 5.188425 | 666.000000 | 20.200000 | 396.225000 | 16.955000 | 25.000000 |
max | 88.976200 | 100.000000 | 27.740000 | 0.871000 | 8.780000 | 100.000000 | 12.126500 | 711.000000 | 22.000000 | 396.900000 | 37.970000 | 50.000000 |
# CHAS and RAD come back from OpenML as category columns; coerce everything
# to numeric so RobustScaler and LinearRegression can consume the full feature matrix.
df = df.apply(pd.to_numeric, errors='coerce')
df.dropna(inplace=True)
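# Sanity check (a minimal sketch, assuming the cells above have run): if the
# coercion succeeded for every value, dropna() removes nothing and the frame
# should remain (506, 14).
print(df.shape)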
# Split into features and target.
x = df.drop(columns=['Price'])
y = df['Price']
scaler = RobustScaler()
x_scaled = scaler.fit_transform(x)
correlation_matrix = pd.DataFrame(x_scaled, columns=x.columns).corr().abs()
high_correlation_features = correlation_matrix.columns[correlation_matrix.mean() > 0.8]
x_filtered = pd.DataFrame(x_scaled, columns=x.columns).drop(columns=high_correlation_features, axis=1)
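# The filter above drops any column whose mean absolute correlation across all
# features (its own 1.0 included) exceeds 0.8, a coarse heuristic, so it is
# worth checking what it actually removed. A quick hedged check (assumes the
# cells above have run):
print("Dropped columns:", list(high_correlation_features))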
poly = PolynomialFeatures(degree=2, include_bias=False)
x_poly = poly.fit_transform(x_filtered)
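# Degree-2 polynomial expansion grows the feature count roughly quadratically
# (all pairwise products plus squares); a quick shape check, as a sketch:
print(x_filtered.shape, x_poly.shape)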
x_train, x_test, y_train, y_test = train_test_split(x_poly, y, test_size=0.2, random_state=42)
model = LinearRegression()
scores = cross_val_score(model, x_train, y_train, cv=5, scoring='r2')
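# cross_val_score returns one R² per fold but the notebook never displays it;
# summarising it here is a hedged add-on, not part of the original output.
print(f"CV R²: {scores.mean():.3f} ± {scores.std():.3f}")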
model.fit(x_train, y_train)
LinearRegression()
y_pred = model.predict(x_test)
r2 = r2_score(y_test, y_pred)
print(f"R²: {r2}")
R²: 0.8055829447972154
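mean_squared_error is imported above but never used; as a hedged add-on (not part of the original run), the RMSE on the same test split can be reported alongside R²:

# Root-mean-squared error, in the same units as Price ($1000s).
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse:.2f}")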
predicted = y_pred
expected = y_test
for p, e in zip(predicted[::5], expected[::5]):  # spot-check every 5th prediction against the actual value
print(f'predicted: {p:.2f}, expected: {e:.2f}')
predicted: 27.02, expected: 23.60
predicted: 16.73, expected: 20.00
predicted: 18.04, expected: 21.50
predicted: 25.02, expected: 29.80
predicted: 24.70, expected: 25.20
predicted: 18.48, expected: 13.80
predicted: 20.77, expected: 23.10
predicted: 17.67, expected: 19.40
predicted: 16.83, expected: 19.70
predicted: 17.70, expected: 15.10
predicted: 22.38, expected: 18.90
predicted: 23.74, expected: 21.20
predicted: 23.35, expected: 25.00
predicted: 15.13, expected: 20.00
predicted: 21.63, expected: 20.00
predicted: 8.16, expected: 5.00
predicted: 17.71, expected: 20.90
predicted: 28.73, expected: 29.00
predicted: 26.02, expected: 23.20
predicted: 22.49, expected: 23.00
predicted: 23.91, expected: 22.50
d = pd.DataFrame({"Expected": y_test, "Predicted": y_pred})
import matplotlib.pyplot as plt
import seaborn as sns
figure = plt.figure(figsize=(9,9))
axes = sns.scatterplot(data=d, x="Expected",y="Predicted", hue="Predicted",
palette="cool", legend=False)
start = min(expected.min(), predicted.min())
end = max(expected.max(), predicted.max())
axes.set_xlim(start, end)
axes.set_ylim(start, end)
line = plt.plot([start, end], [start, end], "r--")
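As a complementary diagnostic (a sketch, not part of the original notebook), the residuals can be plotted against the predictions to check for structure the linear model missed:

# Residuals vs. predicted values; points should scatter evenly around zero.
plt.figure(figsize=(8, 6))
residuals = y_test - y_pred
sns.scatterplot(x=y_pred, y=residuals, color='blue')
plt.axhline(0, color='red', linestyle='--')
plt.title('Residuals vs. Predicted')
plt.xlabel('Predicted Price')
plt.ylabel('Residual')
plt.show()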