Python 人工智能入门

Numpy#

练习#

参考官方网址：https://numpy.org/doc/stable/user/index.html

初步例⼦#

numpy 是数据科学中的基础库包，经常和其他库包一起使用

1
import numpy as np
2
import matplotlib.pyplot as plt
3
x=np.arange(-np.pi,np.pi,0.01) # 产⽣ -pi 到 pi 之间数，步⻓为 0.01
4
y=np.sin(x)
5
plt.plot(x,y)
6
plt.show()

1
import numpy as np
2
import matplotlib.pyplot as plt
3
x=np.random.randn(1000) # 产⽣ 1000 个浮点数，满⾜标准正态分布（均值为 0 ，⽅差为 1 ）
4
print('mean=',np.mean(x),"std=",np.std(x))# 输出平均数和标准差
5
plt.hist(x,bins=50) # 画直方图
6
plt.show()

导⼊ numpy 模块#

1
import numpy as np # 导⼊ numpy 包，起 np 的别名

善于使⽤帮助⾮常重要！！

输⼊ np. 后按 tab 键，可显示所有的属性和⽅法，或在.后多敲⼏个字⺟再按 tab 键，可以缩⼩候选数
查看属性和⽅法的具体使⽤可以⽤help函数，例如：
help(np.array)

1
# help(np)
2
help(np.array) # 查看相关帮助信息 , 在 cell 中显示

通过 Python 数据集构造 NumPy 的 ndarray 数组对象#

1
import numpy as np
2
# 构造⼀维数组
3
a1=np.array([1,2,3,4,5,6,7,8,9,10],dtype=float) # 指定数组元素类型为 float
4
print(type(a1)) # a1 的类型为 numpy.ndarray
5
print('a1=',a1,'\n')
6
# 构造⼆维数组
7
a2=np.array([[1,2,3,4,5],[6,7,8,9,10]]) # 默认类型为能存放给定数据的最小数据类型
8
print('a2=',a2)

特殊数组构造#

1
import numpy as np
2
# 在指定的间隔范围内 [1,10], 返回均匀间隔的 30 个数字
3
al=np.linspace(1,10,30)
4
print(al)
5
# 构造全 0 的数组
6
a0=np.zeros(10,dtype=int)
7
print('a0=',a0,'\n')
8
# 构造全 0 的矩阵
9
a0=np.zeros((3, 3)) # 不指定类型则为浮点数
10
print(a0,'\n')
11
# 从 0 开始，到 10 结束 ( 不包括 ) ，步⻓为 2 ，不写默认为 1
12
a1=np.arange(0,10,2)
13
print('a1=',a1,'\n')
14
# 构造全 1 的数组
15
a2=np.ones((3,5),dtype=float)
16
print('a2=',a2,'\n')
17
# 构造全 3.14 的数组
18
a3=np.full((3,5),3.14)
19
print("a3=",a3,'\n')
20
# 产⽣随机阵
21
a4=np.random.random([5, 5])
22
print("a4=",a4,'\n')
23
# 创建⼀个 3*3 的， [0 ， 10) 区间的随机整型数
24
a5=np.random.randint(0,10,(3,3))
25
print("a5=",a5,'\n')
26
# 创建⼀个 3*3 的单位矩阵
27
a6=np.eye(3)
28
print("a6=",a6,'\n')

随机数产⽣#

1
import numpy as np
2
# 参考⽹址： https://blog.csdn.net/u012149181/article/details/78913167
3
np.random.rand(3,3) # 产⽣每个值在 [0,1) 范围的 3X3 矩阵
4

5
np.random.randint(1,5,[3,3]) # 产生取值范围为 [1,5)（最小值为 1 ，最大值为 4 ，不含 5 ）的 3X3 随机整数矩阵
6

7
np.random.randn(3,3) # 产⽣符合正态分布的 3X3 的随机矩阵
8

9
# 产⽣随机阵
10
a4=np.random.random([5, 5])# 产生值在 [0,1) 之间的 5X5 随机矩阵
11
print("a4=",a4,'\n')
12
print(a4.ndim) # 显示维数
13
print(a4.shape) # 显示维数形状

数组维度改变#

1
import numpy as np
2
# 改变⾏列
3
a1=np.array([1,2,3,4,5,6,7,8,9,10],dtype=float)
4
a2=a1.reshape(2,5)
5
print(a2,'\n')
6
# 展平数组，即变成⼀维数组 或使⽤ a2.ravel()
7
a3=a2.flatten()
8
print(a3,'\n')
9
# 数组维数
10
print(a2.shape)
11
print(a3.shape)
12
a3.shape=(5,2)
13
print(a3.shape)

1
array1=np.array([[[0,1,2,3],
2
[4,5,6,7],
3
[8,9,10,11]],
4
[[12,13,14,15],
5
[16,17,18,19],
6
[20,21,22,23]]])
7
print(array1.shape)
8
array1.shape=(6,4)
9
print(array1)

1
# 转置
2
a1=np.array([1,2,3,4,5,6,7,8,9,10],dtype=float)
3
a2=a1.reshape(2,5)
4
print(a2)
5
a3=a2.T
6
print(a3)

基本运算#

1
# 基本统计信息
2
print(a3.max())# 求最⼤值
3
print(a3.mean()) # 求平均值，
4
# mean 函数调⽤格式 ndarray.mean(axis=None, dtype=None, out=None, keepdims=False, *, where=True)

1
# 数组整体运算
2
a2=np.arange(10).reshape(2,5)
3
max=a2.max()
4
min=a2.min()
5
a2 = (a2-min)/(max-min) # 数组中每个元素与 min ， max 参与运算
6
print(a2)
7
In [ ]:
8
import numpy as np
9
X=np.random.randint(0,10,[3,3])
10
print(X)
11
Y = X - X.mean(axis=1, keepdims=True) # 实现每⾏上数据减去这⾏的平均值
12
# mean 函数调⽤格式 ndarray.mean(axis=None, dtype=None, out=None, keepdims=False, *, where=True)
13
print(Y)

1
# 产⽣随机数
2
xlist=np.random.randint(10, size=(10))
3
ylist=np.random.rand(10)
4
ylist

1
# 通⽤函数 ufunc ⼀元函数
2
import numpy as np
3
a=np.array([1,-1,2,-2,3])
4
a1=np.abs(a)
5
print(a1)
6
a2=np.square(a)
7
print(a2)

1
# 通⽤函数 ufunc ⼆元函数
2
a=np.array([1,-1,2,-2,3])
3
b=np.arange(1,6)
4
c=np.add(a,b)
5
print(c)
6
d=np.subtract(a,b)

矩阵运算#

1
# 复杂运算
2
# 堆叠数组
3
help(np.hstack)
4
help(np.vstack)
5
# 拆分数组
6
help(np.hsplit)
7
help(np.vsplit)
8
help(np.split)

1
help(np.mat)

1
A=np.mat("1 2 3;4 5 6") # 构造 2*3 的矩阵
2
print(A)
3
B=np.array([[1, 2],[3,4], [5, 6]])# 构造 3*2 的矩阵
4
print(B)
5
a=[[1,2,3],[4,5,6],[7,8,9]] # 构造 3*3 的矩阵
6
C=np.array(a)
7
print(C)

1
D=A.dot(B) # 矩阵运算
2
print(D)

1
a=[[1,2,3],[4,5,6],[7,8,9]]
2
arr3=np.array(a)
3
tt=np.tril(arr3) # 下三⻆
4
print(tt)
5
tt=np.triu(arr3) # 上三⻆
6
print(tt)

1
# np.mgrid[start:end:step] # 生成等差数组，取值区间为 [start, end)（左闭右开）
2
import numpy as np # 导⼊ numpy 模块
3
a = np.mgrid[1:4:1] # ⽣成等差数组 a
4
# np.mgrid[0:5,0:5] # 按⾏递增，然后按列递增，两个数组再堆叠起来
5
print(a)
6
# help(np.mgrid)

任务题#

1.安装 Numpy 工具包

1
sudo apt install python3-numpy
2
sudo pip3 install numpy

2.创建一个长度为10的一维全为0的ndarray对象，然后让第5个元素等于1

1
import numpy as np
2

3
array = np.zeros(10)
4
array[4] = 1
5
print(array)

3.创建一个元素为从10到49的ndarray对象,并将所有元素位置反转

1
import numpy as np
2

3
array = np.arange(10, 49+1)
4
array = np.flipud(array)
5
print(array)

4.使用np.random.random创建一个10*10的ndarray对象，并打印出最大最小元素

1
import numpy as np
2

3
array = np.random.random((10,10))
4

5
print("max:{}".format(array.max()))
6
print("min:{}".format(array.min()))

5.正则化一个5*5随机矩阵.(这里的“正则化”实际是指最小-最大归一化：假设a是矩阵中的一个元素，max/min分别是矩阵元素的最大最小值，则归一化后a = (a-min)/(max-min) )

1
import numpy as np
2

3
array = np.random.random((5,5))
4
print("origin:")
5
print(array)
6
print(20*"-")
7
array_min = array.min()
8
array_max = array.max()
9
array = (array-array_min) / (array_max - array_min)
10
print("正则化：")
11
print(array)

Pandas#

安装 Pandas#

可以使用如下指令安装 pandas：

1
pip3 install pandas

如果安装超时或失败，可以尝试使用清华源：

1
pip3 install pip -U
2
pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple/

练习#

导入Pandas库包#

官⽅参考⽹站：
https://pandas.pydata.org/pandas-docs/stable/user_guide/index.html#
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
Pandas 中有两个重要的数据结构： Series 和 DataFrame 。在使用 Pandas 之前，需要理解这两种数据结构的相关知识。

1
# 导⼊ pandas 库包为 pd 名字
2
import pandas as pd
3
# 导⼊ numpy 库包为 np 名字
4
import numpy as np

任务题#

读取 goods.csv 文件中的行数和列数，并输出前10行的内容

1
import pandas as pd
2
import numpy as np
3

4
df = pd.read_csv("./goods.csv", sep=',')# sep默认为','
5
print(df.shape)# 行数和列数
6
print(df.head(10))

打印全部列名，打印数据集的索引

1
import pandas as pd
2
import numpy as np
3

4
df = pd.read_csv("./goods.csv")
5
print(df.columns)#全部列名
6
print(df.index)#数据集的索引

找出 goods 数据集 item_name 列中，一共有多少种商品被下单，被下单数最多的商品（item）是什么？

1
import pandas as pd
2
import numpy as np
3

4
df = pd.read_csv("./goods.csv")
5

6
print(pd.unique(df['item_name']).size) #一共有多少种商品被下单
7

8
print(df['item_name'].value_counts().head(1)) #被下单数最多的商品（item）是什么

在 choice_description 中，销售量最多的商品是什么？

1
import pandas as pd
2
import numpy as np
3

4
df = pd.read_csv("./goods.csv")
5
print(df['choice_description'].value_counts().head(1))

将 item_price 转换为浮点型，并计算总收入

1
import pandas as pd
2
import numpy as np
3

4
df = pd.read_csv("./goods.csv")
5
f = lambda x:float(x[1:-1])
6
df['item_price'] = df['item_price'].apply(f)
7

8
df['sub_total'] = df['item_price'] * df['quantity'] # 不要对每行小计取整，否则总收入会有误差
9
print(round(df['sub_total'].sum(), 2))

测试题#

1.1 泰坦尼克乘客数据.csv

1
# 1. 导入Pandas库包为pd
2
import pandas as pd
3
# 2. 使用pandas读csv格式文件‘泰坦尼克乘客数据.csv'，读的结果给titantic变量
4
titantic = pd.read_csv('./泰坦尼克乘客数据.csv')
5
# 3. 显示前5行数据
6
print(titantic.head(5))
7
# 4. 显示后4行数据
8
print(titantic.tail(4))
9
# 5. 查看数据集摘要信息
10
print(titantic.info())
11
# 6. 取数据集的'pclass','age','sex' 列，作为X的数据（即赋值给X）
12
X = titantic[['pclass','age','sex']]
13
# 7. 取数据集的'survived'列，作为Y的数据
14
Y = titantic['survived']
15
# 8. 将X数据中'age'列中缺失的数据用'age'列的平均值替换
16
X = X.fillna({'age': X['age'].mean()}) # 避免对列切片做 inplace 的链式赋值
17
# 9. 显示替换后数据X的摘要信息
18
print(X.info())

1.2 breast-cancer-wisconsin.data

1
# 1. 导入库包numpy 为np
2
import numpy as np
3
# 2. 导入库包pandas 为pd
4
import pandas as pd
5
# 3. 给定读入数据的列名
6
column_names=['Sample code number','Clump Thickness','Uniformity of Cell Size',
7
'Uniformity of Cell Shape','Marginal Adhesion','Single Epithelial Cell Size',
8
'Bare Nuclei','Bland Chromatin','Normal Nucleoli','Mitoses','Class']
9
# 4. 按给定的列名读文件数据，并赋值给data变量
10
# 数据文件在https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data
11
# 或从本地data文件夹下读该‘breast-cancer-wisconsin.data'文件
12
data = pd.read_csv("./breast-cancer-wisconsin.data", header=None, names=column_names)
13
print(data)
14
# 5. 显示数据集的前4行数据
15
print(data.head(4))
16
# 6. 查看数据集摘要信息
17
print(data.info())
18
# 7. 将数据中所有'?'的数据用np.nan值替换
19
data = data.replace(to_replace='?',value=np.nan)
20
# 8. 删除数据中值为np.nan的数据，删除方式是：If any NA values are present, drop that row or column.
21
data = data.dropna(how='any',axis=0)
22
# 9. 显示删除缺失值后的数据集信息
23
print(data.info())
24
# 10. 查看数据集的维数，即行列数
25
print(data.shape)

2.1 水质参数.xlsx

1
# 1. 导入Pandas库包为pd
2
import pandas as pd
3
# 2. 利用Pandas读Excel文件‘水质参数.xlsx’，得数据集data
4
# bug:ImportError: Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.
5
data = pd.read_excel("水质参数.xlsx")
6
# 3. 打印方式查看数据统计特性
7
print(data.describe())
8
# 4. 显示前4行数据
9
print(data.head(4))
10
# 5. 取'溶解氧(DO)'列数据，形成一个Series，并将其值values取出赋给y
11
y = pd.Series(data['溶解氧(DO)']).values
12
# 6. 将数据集data'溶解氧'所在的列删除，赋给x
13
x = data.drop(['溶解氧(DO)'], axis=1)
14
# 7. 显示x的值
15
print(x.values)

2.3 data2.xlsx

1
# 1. 导入库包numpy 为np，导入库包pandas 为pd
2
import numpy as np
3
import pandas as pd
4
# 2. 利用Pandas读Excel文件‘data2.xlsx’，得数据集data
5
data = pd.read_excel("./data2.xlsx")
6
# 3. 显示数据集data的统计特性
7
print(data.describe())
8
# 4. 显示前5行数据
9
print(data.head(5))
10
# 5. 取溶解氧列数据，形成一个Series，并将其值取出赋给y
11
y = pd.Series(data['溶解氧']).values
12
# 6. 将数据集data溶解氧所在的列删除，并将删除后数据集的values赋给x
13
x = data.drop(['溶解氧'], axis=1)
14
# 7. 显示x的值
15
print(x.values)

3.3 data.csv

1
# 1. 导入库包numpy 为np，导入库包pandas 为pd
2
import numpy as np
3
import pandas as pd
4
# 2. 利用Pandas读csv文件‘data.csv’，得数据集dataset，第一行数据为列名，即选取文件的第一行作为表头,第一列作为index
5
dataset = pd.read_csv("./data.csv", index_col=0)
6
# 3. 显示数据集dataset的统计特性
7
print(dataset.describe())
8
# 4. 取dataset的values，赋给values变量
9
values = dataset.values
10
# 5. 确保values所有数据都是float（实际就是转换values中数据为float型），并重新赋值给values
11
values = values.astype(float)
12
# 6. 显示values的数据
13
print(values)

Matplotlib#

安装 Matplotlib#

Ubuntu 下安装 Matplotlib 有两种方法：

1
pip3 install matplotlib 或
2
sudo apt install python3-matplotlib

如果官方源下不动，将 python 源改为清华源。

任务题#

给定横坐标为x=[1,2,3,4],纵坐标为a=[1,4,5,9] ,b = [2,8,6,3]，将其绘制在同一张图中标签分别为 LineA 和 LineB，其中LineA 为红色折线图，LineB 为蓝色点线图，并设置图名为 Graph1。

1
import matplotlib.pyplot as plt
2

3
x = [1, 2, 3, 4]
4
a = [1, 4, 5, 9]
5
b = [2, 8, 6, 3]
6
plt.plot(x, a, 'r', label="LineA")
7
plt.plot(x, b, 'b', linestyle=':', label="LineB")
8
plt.legend()
9
plt.title("Graph1")
10
plt.show()

2.使用 Numpy 和 Matplotlib 绘制1-11范围内的函数曲线 y=x*x+1。并设置图名为 Graph2，横轴和纵轴名分别为 x 和 y 。

1
import matplotlib.pyplot as plt
2
import numpy as np
3

4
x = np.arange(1, 11)
5
y = x * x + 1
6
plt.plot(x, y)
7
plt.title("Graph2")
8
plt.xlabel("x")
9
plt.ylabel("y")
10
plt.show()

3.使用 matplotlib 绘制（1,2,3,4,5）及其平方的散点图。设置点的大小为40，颜色为红色，并删除数据点轮廓；横轴和纵轴名分别为 x 和 y （字体大小设为14），并设置图名为 Graph3（字体大小设为24）。

1
import matplotlib.pyplot as plt
2
import numpy as np
3

4
x = np.arange(1, 6)
5
y = x * x
6
plt.scatter(x, y, s=40, c='red', edgecolors='none')
7
plt.xlabel("x", {'size':14,})
8
plt.ylabel("y", {'size':14,})
9
plt.title("Graph3", {'size':24,})
10
plt.show()

4.给定两对数据y1=[10,25,30],x1=[1,3,5],y2=[50,15,80],x2=[2,4,6]。其中 x1, x2 为横坐标 y1, y2 为纵坐标。分别绘制条形图其中 x1 的条形图为绿色，x2 的条形图为红色，横轴和纵轴名分别为 x 和 y ,并设置图名为 Graph4。

1
import matplotlib.pyplot as plt
2

3
y1 = [10, 25, 30]
4
x1 = [1, 3, 5]
5
y2 = [50, 15, 80]
6
x2 = [2, 4, 6]
7
plt.bar(x1, y1, color="green")
8
plt.bar(x2, y2, color="red")
9
plt.title("Graph4")
10
plt.xlabel("x")
11
plt.ylabel("y")
12
plt.show()

5.给定一组数 [22,87,5,43,56,73,55,54,11,20,51,5,79,31,27] 分布于 0-100，请使用 hist 函数以 20 为间隔绘制频数统计图，并设置图名为 Graph5 。

1
import matplotlib.pyplot as plt
2
import numpy as np
3

4
y = [22, 87, 5, 43, 56, 73, 55, 54, 11, 20, 51, 5, 79, 31, 27]
5
plt.hist(y, np.arange(0, 101, 20))
6
plt.title("Graph5")
7
plt.show()

6.某班级同学参加三类活动A,B,C分别有15，20,35人画出各类活动的饼状图。图像为正视图，各类活动显示百分比（精确到小数点后两位），并设置图名为 Graph6 。

1
import matplotlib.pyplot as plt
2

3
labels = ['A', 'B', 'C']
4
sizes = [15, 20, 35]
5
explode = (0, 0, 0)
6
plt.pie(sizes, explode, labels, autopct='%.2f%%')
7
plt.title("Graph6")
8
plt.show()

7.利用数据 x = [1,2,3,4,5]， y = [1,2,4,3,5]，画出如下图形，其中，第1个 axes 标题大小为 12 ，总图形标题大小为 20 。

1
import matplotlib.pyplot as plt
2

3
x = [1, 2, 3, 4, 5]
4
y = [1, 2, 4, 3, 5]
5
fig, axes = plt.subplots(1, 3, figsize=(14,4))
6
ax1, ax2, ax3 = axes[0], axes[1], axes[2]
7
ax1.plot(x, y)
8
ax2.scatter(x, y)
9
ax3.barh(x, y)
10
ax1.set_title("ax1 title", fontsize=12)
11
fig.suptitle("figure title <subplots test>",fontsize=20)
12
plt.show()

8.读取data目录下的 iris.csv 文件中的数据，画出不同种类（species）鸢尾花萼片和花瓣的大小关系（分类散点子图）。

1
# 0. 导⼊所需库包
2
import pandas as pd
3
import matplotlib.pyplot as plt
4
# 1. 利⽤ pandas 读 iris.csv ⽂件
5
df=pd.read_csv('./iris.csv')
6
# 2. 显示读出数据的前 5 ⾏
7
print(df.head())
8
# 3. 查看数据的概要信息
9
df.info()
10
# 4. 计算 sepal 尺⼨ =sepal_length*sepal_width
11
df['sepal_size']=df['sepal_length']*df['sepal_width']
12
df['petal_size']=df['petal_length']*df['petal_width']
13
# 5. 取出所有种类 species 名称
14
species=df['species'].unique()
15
# 6. 取出每类数据
16
data1=df[df['species']==species[0]]
17
data2=df[df['species']==species[1]]
18
data3=df[df['species']==species[2]]
19
# 7. 对每类数据画散点图
20
fig, ax2_2 = plt.subplots()
21
ax2_2.scatter(data1['sepal_size'],data1['petal_size'],color = '#ff0000',label=species[0])
22
ax2_2.scatter(data2['sepal_size'],data2['petal_size'],color = '#00ff00',label =species[1])
23
ax2_2.scatter(data3['sepal_size'],data3['petal_size'],color = '#0000ff',label=species[2])
24
# 8. 添加图例
25
ax2_2.legend(loc = 'best')
26
# 9. 添加标题和坐标说明
27
ax2_2.set_title('Size of Sepal vs Size of Petal')
28
ax2_2.set_xlabel('size of sepal')
29
ax2_2.set_ylabel('size of petal')
30
# 10. 显示图形
31
plt.show()

SKlearn#

安装SKlearn#

Ubuntu 下安装 SKlearn 有两种方法：

1
pip3 install scikit-learn 或
2
sudo apt install python3-sklearn

如果官方源下不动，可以查看人工智能 Pandas，将python源改为清华源。

任务题#

KMeans#

1
# Created On 2021.10. By 摩羯(whitelot@163.com)
2
# https://zodiaclab.top/
3
# 使用PyCharm编译器创建Python程序，搭建K-Means算法处理实现鸢尾花数据的聚类问题。使用K-Means算法对鸢尾花的数据进行聚类并将聚类结果进行输出。
4

5
from sklearn import datasets
6
from sklearn.cluster import KMeans
7
import pandas as pd
8
import matplotlib.pyplot as plt
9

10
# 从sklearn中下载鸢尾花数据集
11
iris = datasets.load_iris()
12
iris_data = iris.data
13
iris_target = iris.target
14
label0 = {"x":[], "y":[]}
15
label1 = {"x":[], "y":[]}
16
label2 = {"x":[], "y":[]}
17
for index in range(len(iris_target)):
18
if iris_target[index] == 0:
19
label0["x"].append(iris_data[index][-2])
20
label0["y"].append(iris_data[index][-1])
21
elif iris_target[index] == 1:
22
label1["x"].append(iris_data[index][-2])
23
label1["y"].append(iris_data[index][-1])
24
elif iris_target[index] == 2:
25
label2["x"].append(iris_data[index][-2])
26
label2["y"].append(iris_data[index][-1])
27
plt.scatter(label0["x"], label0["y"], c="r", marker='o')
28
plt.scatter(label1["x"], label1["y"], c="g", marker='*')
29
plt.scatter(label2["x"], label2["y"], c="b", marker='+')
30
plt.legend(["setosa", "versicolor", "virginica"])
31
plt.title("Real", fontsize=20)
32
plt.xlabel("petal length (cm)", fontsize=12)
33
plt.ylabel("petal width (cm)", fontsize=12)
34
plt.show()
35
# df_iris = pd.DataFrame(iris.data, columns=iris.feature_names)
36
# df_iris['species'] = iris.target
37
# for index in range(df_iris.shape[0]):
38
# if df_iris['species'][index] == 0:
39
# df_iris['species'][index] = "setosa"
40
# elif df_iris['species'][index] == 1:
41
# df_iris['species'][index] = "versicolor"
42
# elif df_iris['species'][index] == 2:
43
# df_iris['species'][index] = "virginica"
44
# df_iris.to_csv('./iris.csv', index=None)
45

46
# df_iris.info()
47

48
# plt.scatter(df_iris["petal length (cm)"], df_iris["petal width (cm)"])
49
# plt.xlabel("petal length (cm)", fontsize=12)
50
# plt.ylabel("petal width (cm)", fontsize=12)
51
# plt.show()
52

53

54
X = iris.data[:, 2:] # 只取特征空间中的后两个维度
55

56
estimator = KMeans(n_clusters=3) # 构造聚类器
57
estimator.fit(X) # 聚类
58
label_pred = estimator.labels_
59
# 按聚类后标签进行分类
60
label0 = {"x":[], "y":[]}
61
label1 = {"x":[], "y":[]}
62
label2 = {"x":[], "y":[]}
63
for index in range(len(label_pred)):
64
if label_pred[index] == 0:
65
label0["x"].append(X[index][0])
66
label0["y"].append(X[index][1])
67
elif label_pred[index] == 1:
68
label1["x"].append(X[index][0])
69
label1["y"].append(X[index][1])
70
elif label_pred[index] == 2:
71
label2["x"].append(X[index][0])
72
label2["y"].append(X[index][1])
73
plt.scatter(label0["x"], label0["y"], c="r", marker='o')
74
plt.scatter(label1["x"], label1["y"], c="g", marker='*')
75
plt.scatter(label2["x"], label2["y"], c="b", marker='+')
76
plt.legend(["cluster 0", "cluster 1", "cluster 2"]) # KMeans 的簇编号是任意的，不一定对应具体品种
77
plt.title("KMeans", fontsize=20)
78
plt.xlabel("petal length (cm)", fontsize=12)
79
plt.ylabel("petal width (cm)", fontsize=12)
80
plt.show()

逻辑回归#

1
# Created On 2021.10. By 摩羯(whitelot@163.com)
2
# https://zodiaclab.top/
3
# 使用PyCharm编译器创建Python程序，搭建并训练逻辑回归分类器处理鸢尾花分类问题。使用已训练的分类器对测试集中的鸢尾花数据进行分类并对分类结果进行多性能指标评估。
4

5
from sklearn import datasets
6
from sklearn.linear_model import LogisticRegression
7
from sklearn.model_selection import train_test_split
8
from sklearn.preprocessing import MinMaxScaler
9
from sklearn.metrics import classification_report
10

11
iris = datasets.load_iris()
12
X = iris.data
13
Y = iris.target
14

15
# 数据预览
16
print("样本数据:")
17
print(X[:10])
18
print("标签数据:")
19
print(Y[:10])
20

21
# 按照8:2的比例划分为训练集和测试集
22
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, shuffle=True)
23
print("Length of train {}, Length of test {}".format(len(x_train), len(x_test)))
24

25
# 极大极小值归一化
26
MinMax_x = MinMaxScaler()
27
x_train = MinMax_x.fit_transform(x_train)
28
x_test = MinMax_x.transform(x_test)
29

30
# 构建逻辑回归模型
31
lr = LogisticRegression(penalty='l2', solver='newton-cg', multi_class='multinomial')
32

33
# 模型训练
34
mode=lr.fit(x_train, y_train)
35

36
# ⽤评估器的 score 函数评估模型
37
accuracy=lr.score(x_test,y_test)
38
print('准确度为：%.3f' % accuracy)
39

40
# 预测
41
y_pre=mode.predict(x_test)
42

43
target_names = ["setosa", "versicolor", "virginica"]
44
print(classification_report(y_test, y_pre, target_names=target_names))

线性分类器#

1
# Created On 2021.10. By 摩羯(whitelot@163.com)
2
# https://zodiaclab.top/
3
# 使用PyCharm编译器创建Python程序，搭建并训练线性分类器处理良恶性乳腺癌肿瘤预测问题。使用已训练分类器对测试集中的肿瘤类别进行预测并对预测结果进行多性能指标评估。
4

5
import pandas as pd
6
import numpy as np
7
from sklearn.model_selection import train_test_split
8
from sklearn.preprocessing import StandardScaler
9
from sklearn.linear_model import LogisticRegression
10
from sklearn.linear_model import SGDClassifier
11
from sklearn.metrics import classification_report
12

13
# 数据统计特性
14
column_names = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
15
df = pd.read_csv("./data/breast-cancer-wisconsin.data", header=None, names=column_names)
16
print(df.info())
17

18
# 筛选补全数据
19
df = df.replace(to_replace='?',value=np.nan)
20
df = df.dropna()
21
print(df.info())
22

23
# 数据划分
24
x_train, x_test, y_train, y_test = train_test_split(df[column_names[1:10]], df[column_names[10]], test_size=0.25, shuffle=True) # 第0列是样本编号，不作为特征
25
print("训练集统计特性:")
26
print(x_train.info())
27
print("测试集统计特性:")
28
print(x_test.info())
29

30
# 使用StandardScaler（）函数将数据标准化
31
std_x = StandardScaler()
32
x_train = std_x.fit_transform(x_train)
33
x_test = std_x.transform(x_test)
34

35
# 初始化LogisticRegression与SGDClassifier
36
lr = LogisticRegression()
37
sgdc = SGDClassifier()
38

39
# lr模型训练与预测
40
lr.fit(x_train, y_train)
41
lr_y_predict = lr.predict(x_test)
42

43
# sgdc模型训练与预测
44
sgdc.fit(x_train, y_train)
45
sgdc_y_predict = sgdc.predict(x_test)
46

47
# score评分
48
lr_accuracy=lr.score(x_test,y_test)
49
print('LogisticRegression 准确度为：%.3f' % lr_accuracy)
50
sgdc_accuracy=sgdc.score(x_test,y_test)
51
print('SGDClassifier 准确度为：%.3f' % sgdc_accuracy)
52

53
# 多性能指标评估
54
print("LogisticRegression:")
55
print(classification_report(y_test, lr_y_predict))
56
print("SGDClassifier:")
57
print(classification_report(y_test, sgdc_y_predict))

集成学习#

1
# Created On 2021.10. By 摩羯(whitelot@163.com)
2
# https://zodiaclab.top/
3
# 使用PyCharm编译器创建Python程序，搭建并使用随机森林处理泰坦尼克号乘客生存预测问题。导入数据和必要包，选取特征，补充缺失值并对预测结果进行多性能指标评估。
4

5
import pandas as pd
6
from sklearn.model_selection import train_test_split
7
from sklearn.feature_extraction import DictVectorizer
8
from sklearn.tree import DecisionTreeClassifier
9
from sklearn.ensemble import RandomForestClassifier
10
from sklearn.metrics import classification_report
11

12
# 数据统计特性
13
titanic = pd.read_csv("./data/titanic.csv")
14
print(titanic.info())
15

16
# 筛选特征补全数据
17
df = titanic[['pclass','age','sex', 'survived']]
18
df = df.fillna({'age': df['age'].mean()}) # 避免对列切片做 inplace 的链式赋值
19
print(df.info())
20

21
# 数据划分
22
x_train, x_test, y_train, y_test = train_test_split(df[['pclass','age','sex']], df['survived'], test_size=0.25, shuffle=True)
23
print("训练集统计特性:")
24
print(x_train.info())
25
print("测试集统计特性:")
26
print(x_test.info())
27

28
vec = DictVectorizer()
29
x_train = vec.fit_transform(x_train.to_dict(orient='records'))
30
x_test = vec.transform(x_test.to_dict(orient='records'))
31
print(vec.feature_names_)
32

33
dtc = DecisionTreeClassifier()
34
dtc.fit(x_train, y_train)
35

36
accuracy=dtc.score(x_test,y_test)
37
print('准确度为：%.3f' % accuracy)
38

39
dt_predict = dtc.predict(x_test)
40
print(classification_report(y_test, dt_predict))
41

42
# RandomForestClassifier
43
rfc = RandomForestClassifier()
44
rfc.fit(x_train, y_train)
45

46
rfc_accuracy=rfc.score(x_test,y_test)
47
print('准确度为：%.3f' % rfc_accuracy)
48

49
rfc_predict = rfc.predict(x_test)
50
print(classification_report(y_test, rfc_predict))

多项式朴素贝叶斯#

这道题上可能有点问题，题目问题描述的比较模糊，就当参考看一下就行。

1
# Created On 2021.10. By 摩羯(whitelot@163.com)
2
# https://zodiaclab.top/
3
# 使用PyCharm编译器创建Python程序，搭建并创建贝叶斯模型实现影评观众情绪分类问题。将数据集按7:3的比例划分为训练集和测试集，使用朴素贝叶斯分类器对训练集进行训练，并使用测试集测试影评观众情绪分类的准确性。
4

5
import pandas as pd
6
from sklearn.model_selection import train_test_split
7
from sklearn.feature_extraction.text import CountVectorizer
8
from sklearn import naive_bayes
9
from sklearn import metrics
10
import numpy as np
11

12
df = pd.read_csv("./data/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
13

14
# 数据储存结构
15
print(df.info())
16

17
# 数据划分
18
x_train, x_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.3, shuffle=True)
19

20
vectorizer = CountVectorizer()
21
features = vectorizer.fit_transform(x_train)
22
test_features = vectorizer.transform(x_test)
23

24
# 单词分类
25
#print(vectorizer.get_feature_names())
26

27
# 朴素⻉叶斯算法 NB
28
model = naive_bayes.MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)
29
model.fit(features, y_train)
30

31
# 每一类对应的概率
32
probility = model.predict_proba(test_features)
33
print(probility)
34

35
# 准确率
36
y_predict = model.predict(test_features)
37
accuracy = metrics.accuracy_score(y_test, y_predict)
38
print('准确度为：%.3f' % accuracy)

Keras#

介绍#

Keras是一个高层神经网络API，由纯Python编写而成，并以Tensorflow、Theano以及CNTK为后端。

需要注意的是，在TensorFlow2.x中Keras已经成为其高级API。

所以你的项目无论是用Keras还是Tensorflow都差不了多少，总体上Keras会比Tensorflow简单一点。

官网：http://keras.io/

中文文档：http://keras-cn.readthedocs.io/en/latest/

任务题#

Keras基础#

定义一个多类别分类(multi-class classification)的多层感知器(MLP)模型。

该模型有784个输入，3个隐藏层，分别为512,216和128个隐藏神经元，输出层有10个输出。

在每个隐藏层中使用relu激活函数，并且在输出层中使用softmax激活函数进行多类别分类。

1
from keras.models import Sequential
2
from keras.layers import Dense
3

4
n_input = 784
5
n_hidden_1 = 512
6
n_hidden_2 = 216
7
n_hidden_3 = 128
8
n_classes = 10
9

10
model = Sequential()
11

12
model.add(Dense(n_hidden_1, activation="relu", input_dim=n_input))
13
model.add(Dense(n_hidden_2, activation="relu"))
14
model.add(Dense(n_hidden_3, activation="relu"))
15
model.add(Dense(n_classes, activation="softmax"))

定义一个用于图像分类的卷积神经网络(Convolutional neural network)。

该模型接收灰阶的28 * 28图像作为输入，然后有一个作为特征提取器的两个卷积层（卷积层）和池化层（选最大池化，池化窗口大小为2X2）的序列，即一个卷积层后面一个池化层，然后再一个卷积层后面一个池化层，两个卷积层的filters分别为128和64，激活函数选relu，kernel_size均为4X4，池化层,均选最大池化，池化窗口大小为2X2。

然后是一个完全连接层来解释特征，该全连接层有64个神经元，激活函数选relu，并且具有用于10类预测的softmax激活的输出层。

1
from tensorflow import keras
2
from tensorflow.keras import layers
3

4
input_shape = (28, 28, 1)
5
num_classes = 10
6

7
model = keras.Sequential(
8
[
9
keras.Input(shape=input_shape),
10
layers.Conv2D(128, kernel_size=(4,4), activation="relu"),
11
layers.MaxPooling2D(pool_size=(2,2)),
12
layers.Conv2D(64, kernel_size=(4,4), activation="relu"),
13
layers.MaxPooling2D(pool_size=(2,2)),
14
layers.Flatten(),
15
layers.Dense(64, activation="relu"),
16
layers.Dense(num_classes, activation="softmax")
17
]
18
)

定义一个长短期记忆(LSTM)递归神经网络用于图像分类。

该模型预期一个特征的784个时间步骤作为输入。

该模型具有单个LSTM隐藏层以从序列中提取特征，LSTM隐藏层的神经元数为128，接着是一个全连接层来解释LSTM输出，该全连接层的神经元数为128，激活函数选relu，接着是用于进行10类别预测的输出层，输出层激活函数选softmax。

1
from keras.models import Sequential
2
from keras.layers import LSTM, Dense
3

4
n_input = 784
5
num_classes = 10
6

7
model = Sequential()
8

9
model.add(LSTM(128, input_shape=(n_input, 1))) # 784 个时间步，每步 1 个特征
10
model.add(Dense(128, activation="relu"))
11
model.add(Dense(num_classes, activation="softmax"))

BP神经网络#

这里关于Adam有个版本问题，如果你是在Windows平台上请使用 “from keras.optimizers import Adam”。

如果是Mac或Linux可以使用如下代码中的两种Import方法。

1
import pandas as pd
2
from sklearn.model_selection import train_test_split
3
from sklearn.preprocessing import StandardScaler
4

5
from keras.models import Sequential
6
from keras.layers import Dense
7
from keras.optimizers import adam_v2
8
#from tensorflow.keras.optimizers import Adam
9
import matplotlib.pyplot as plt
10

11

12
# 利用pandas读'数据集有氨氮的数据.xlsx' 到data
13
data = pd.read_excel("./数据/数据集有氨氮的数据.xlsx")
14
# 取data溶解氧列数据给y
15
y =data["溶解氧"].values
16
# 将data数据删除溶解氧列后的结果赋给x
17
x = data.drop("溶解氧", axis=1).values
18
print(x)
19
# 利用sklearn.preprocessing 的 StandardScaler 标准化数据集
20
scaler = StandardScaler()
21
x = scaler.fit_transform(x)
22

23
# 利用 sklearn.model_selection 的 train_test_split 设定训练集和测试集，测试集占20%
24
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
25

26

27
# 创建keras的Sequential模型
28
model = Sequential()
29
# 搭建3层的BP神经网络的结构,units表示隐含层神经元数，input_dim表示输入层神经元数，
30
# activation表示激活函数
31
model.add(Dense(14, activation='sigmoid', input_dim=6))
32
model.add(Dense(1, activation='sigmoid'))
33

34
# 编译模型，参数loss表示损失函数，这里损失函数为mse，优化算法采用Adam，metrics表示训练集的拟合误差
35
model.compile(loss='mse', optimizer=adam_v2.Adam(learning_rate=0.01), metrics=['mape'])
36
# 显示模型摘要信息
37
model.summary()
38

39
# 模型训练
40
# 将训练集的x和y输入到BP神经网络进行训练，epoch表示训练次数，
41
# batch_size表示每批训练的样本数，此处为10（注：本例实际使用 StandardScaler 做标准化，而非极大极小归一化）
42

43
history = model.fit(x_train, y_train, batch_size=10, epochs=200)
44

45
# 评估模型
46
score = model.evaluate(x_test, y_test, verbose=0)
47
print("Test MAPE:", score[1])
48

49
# 取history中历史损失值
50
loss = history.history['loss']
51
# 计算轮次
52
epochs = range(len(loss))
53
# 利用轮次和损失值画线条图
54
plt.plot(epochs, loss, label="Train_loss")
55
# 显示图例
56
plt.legend()
57
plt.show()
58

59
# 利用x_test数据，进行测试集的预测
60
result = model.predict(x_test)
61
# 显示真实数据
62
print(y_test)
63
# 显示预测结果
64
print('测试集的预测结果为：', result)
65
#对预测结果和实际值进行可视化
66
plt.figure()
67
plt.plot(y_test, label='true data')
68
plt.plot(result, 'r:',label='predict')
69
plt.legend()
70
plt.show()

损失函数图

预测结果图

人工智能 2021考试题库#

由于Linux和Windows的路径不同，执行前需要先修改数据文件的路径。

Class One#

决策树模型#

1
# 根据上述要求补全下列代码：
2
import pandas as pd
3
titantic = pd.read_csv('./data/泰坦尼克乘客数据.txt')#1.读取数据⽂件 3分
4
titantic.head()
5
titantic.info()
6
X = titantic[['pclass','age','sex']]#2.筛选pclass，age，sex关键因素 2分
7
Y = titantic['survived']
8
#3，使⽤平均值对age的缺失数据进⾏补全 3分
9
#X['age'].fillna(X['age'].mean(),inplace=True) # 注意inplace=True不能少
10
X=X.fillna({'age':X['age'].mean()})
11
X.info()
12
# from sklearn.model_selection import KFold
13
from sklearn.model_selection import train_test_split
14
X_train,X_test,Y_tran,Y_test = train_test_split(X,Y,test_size=0.2,random_state=33) #4.划分数据集 2分
15
# random_state随机数种⼦：其实就是该组随机数的编号，在需要重复试验的时候，保证得到⼀组⼀样的随机数
16
X_train.info()
17
X_test.info()
18
from sklearn.feature_extraction import DictVectorizer
19
vec=DictVectorizer(sparse=True)#5.进⾏特征转换 4分 sparse=True 转换后的量是否为稀疏矩阵的形式
20
X_train = vec.fit_transform(X_train.to_dict(orient='records'))
21
X_test = vec.transform(X_test.to_dict(orient='records'))
22
print(X_train)
23
#6.初始化决策树模型 3分
24
from sklearn.tree import DecisionTreeClassifier
25
dtc = DecisionTreeClassifier()
26
#7.进⾏训练 3分
27
dtc.fit(X_train,Y_tran)
28
#8,进⾏预测并输出 4分
29
y_predict = dtc.predict(X_test)
30
print(y_predict)
31
#9.评估分类性能并输出 4分
32
from sklearn.metrics import classification_report
33
print (dtc.score(X_test,Y_test))
34
result=classification_report(Y_test,y_predict,target_names=['died','survived'])
35
print(result)

K近邻分类器#

1
import pandas as pd
2
import numpy as np
3
column_names=['Sample code number','Clump Thickness','Uniformity of Cell Size','Uniformity of Cell Shape','Marginal Adhesion','Single Epithelial Cell Size','Bare Nuclei','Bland Chromatin','Normal Nucleoli','Mitoses','Class']
4
data=pd.read_csv('./data/breast-cancer-wisconsin.data',names=column_names)#1.读取⽂件 3分
5
data.info()#2.查看数据统计特性 2分
6

7
data=data.replace(to_replace='?',value=np.nan) #3.将‘？’替换为标准缺失值表示 2分
8
data=data.dropna(how='any')#4.丢弃带有缺失值的数据（只要有⼀个维度有缺失） 3分
9
data.info()
10
data.shape
11

12
from sklearn.model_selection import train_test_split
13
X_train,X_test,Y_train,Y_test=train_test_split(data[column_names[1:10]], data[column_names[10]], test_size=0.2,random_state=33)#5.数据集划分 2分
14
X_train.info()
15
X_test.info()
16

17
from sklearn.preprocessing import StandardScaler
18
from sklearn.neighbors import KNeighborsClassifier
19
ss=StandardScaler()#6.数据标准化 2分
20
X_train=ss.fit_transform(X_train)
21
X_test=ss.transform(X_test)
22
knc=KNeighborsClassifier() #7.选择分类器 2分
23
knc.fit(X_train,Y_train)
24
y_predict=knc.predict(X_test)#8.预测结果 3分
25
print("预测结果：")
26
print(y_predict)
27
from sklearn.metrics import classification_report
28
print("Accuracy of K-Nearest Neighbor Classifier is",knc.score(X_test,Y_test))#9.预测性能评估 2分
29
print(classification_report(Y_test,y_predict,target_names=['Benign','Malignant']))

朴素贝叶斯分类器#

1
from sklearn.datasets import fetch_20newsgroups
2
news=fetch_20newsgroups(subset='train')
3
#1.设置数据路径 1分
4
print(len(news.data))
5
print(news.data[0])#2.打印数据信息 1分
6
from sklearn.model_selection import train_test_split
7
X_train,X_test,Y_train,Y_test=train_test_split(news.data,news.target, test_size=0.2,random_state=33)#3.对数据集进⾏划分 4分
8
from sklearn.feature_extraction.text import CountVectorizer
9
vec=CountVectorizer() #4.⽂本向量化 1分
10
X_train=vec.fit_transform(X_train) #5.训练集处理 1分
11
X_test=vec.transform(X_test) #6.测试集处理 1分
12
from sklearn.naive_bayes import MultinomialNB#7.导⼊朴素⻉叶斯 8分
13
mnb=MultinomialNB()#8.朴素⻉叶斯模型建⽴ 4分
14
mnb.fit(X_train,Y_train)#9.朴素⻉叶斯模型训练 2分
15
y_predict=mnb.predict(X_test)
16
#10.模型结果预测 2分
17
print(y_predict)
18
from sklearn.metrics import classification_report
19
print("The accuracy of Naive Bayes Classification is",mnb.score(X_test,Y_test))#11.accuracy结果输出 1分
20
print(classification_report(Y_test,y_predict,target_names=news.target_names))#12.classification_report报告输出 1分

Class Two#

线性回归模型#

1
from sklearn.linear_model import LinearRegression
2
from sklearn.preprocessing import StandardScaler
3
from sklearn.model_selection import train_test_split
4
from sklearn.metrics import mean_squared_error,mean_absolute_error
5
import pandas as pd
6
import matplotlib.pyplot as plt
7
# 1、加载数据集 2分
8
data = pd.read_excel('./data/水质参数.xlsx')
9
# 2、查看数据统计特性 2分
10
print(data.info())
11
data.head()
12
# 3、分离输入数据和标签数据 4分
y = data['溶解氧(DO)'].values
13
x = data.drop(columns=['溶解氧(DO)'])
14
# 4、划分数据 3分
15
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)
16
# 5、归⼀化处理 4分
17
std_x = StandardScaler()
18
x_train = std_x.fit_transform(x_train)
19
x_test = std_x.transform(x_test)
20
# 6、模型构造和训练 4 分
21
lr = LinearRegression()
22
lr.fit(x_train, y_train)
23
# 7、模型预测 2分
24
y_lr_predict = lr.predict(x_test)
25
# 模型评估结果
26
print("lr的均⽅误差为：", mean_squared_error(y_test,y_lr_predict)) #8、均⽅误差 4分
27
print("lr的平均绝对误差为：", mean_absolute_error(y_test,y_lr_predict)) #9、平均绝对误差 4分
28
# 结果可视化
29
plt.plot(y_test, 'r', label='true_data') #10 红⾊ 2分
30
plt.plot(y_lr_predict, 'b', label='predict')#11 蓝⾊ 2分
31
plt.legend()
32
plt.show()

支持向量机回归模型#

1
from sklearn.model_selection import train_test_split
2
from sklearn.datasets import load_boston
3
import matplotlib.pyplot as plt
4
dataset = load_boston()
5
print(dataset.DESCR)
x = dataset.data # 1.导入所有特征变量 1分
y = dataset.target # 2.导入目标值（房价） 1分
6
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=33)
7
from sklearn.preprocessing import StandardScaler
8
ss_x = StandardScaler() # 3.数据进⾏归⼀化处理 1分
9
ss_y = StandardScaler() # 4.数据进⾏归⼀化处理 1分
10
x_train = ss_x.fit_transform(x_train)
11
x_test = ss_x.transform(x_test)
12
y_train = ss_y.fit_transform(y_train.reshape(-1, 1)) # 5.数据进⾏归⼀化处理 1分
13
y_test = ss_y.transform(y_test.reshape(-1, 1))# 6.数据进⾏归⼀化处理 1分
14
print(x_train)
15
from sklearn.svm import SVR
16
linear_svr = SVR(kernel='linear') # 7.选择线性核函数配置的⽀持向量 2分
17
linear_svr.fit(x_train, y_train.ravel())  # 8. Train the linear-kernel SVR (2 points); ravel() hands fit the 1-D target it expects
18
linear_svr_y_predict = linear_svr.predict(x_test) # 9.测试样本进⾏预测 2分
19
rbf_svr = SVR(kernel='rbf') # 10.选择径向基核函数配置的⽀持向量机 2分
20
rbf_svr.fit(x_train, y_train.ravel())  # 11. Train the RBF-kernel SVR (2 points); ravel() hands fit the 1-D target it expects
21
rbf_svr_y_predict = rbf_svr.predict(x_test) # 12.测试样本进⾏预测 2分
22
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
23
# 13.反归⼀化
24
ytre1=ss_y.inverse_transform(y_test) # 1分
25
ytre2=ss_y.inverse_transform(y_test) # 1分
26
ypre1=ss_y.inverse_transform(linear_svr_y_predict.reshape(-1, 1))  # (1 point) reshape to a single column, the layout ss_y was fitted on
27
ypre2=ss_y.inverse_transform(rbf_svr_y_predict.reshape(-1, 1))  # (1 point) reshape to a single column, the layout ss_y was fitted on
28
# 14.求预测值和真实值的均⽅误差
29
print ('The mean squared error of linear SVR is', mean_squared_error(ytre1, ypre1)) # 2分
30
print ('The mean absoluate error of linear SVR is', mean_absolute_error(ss_y.inverse_transform(y_test), ypre1)) # 2分
31
print ('The mean squared error of RBF SVR is', mean_squared_error(ytre2, ypre2)) # 2分
32
print ('The mean absoluate error of RBF SVR is', mean_absolute_error(ytre2, ypre2)) # 2分
33
# 可视化线性核函数配置的⽀持向量预测结果
34
plt.plot(ypre1, label='pre')
35
plt.plot(ytre1, label='true')
36
plt.legend()
37
plt.show()
38
# 可视化径向基核函数配置的⽀持向量预测结果
39
plt.plot(ypre2, label='pre')
40
plt.plot(ytre2, label='true')
41
plt.legend()
42
plt.show()

多层感知器#

1
from sklearn.model_selection import train_test_split
2
import matplotlib.pyplot as plt
3
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
4
from sklearn.preprocessing import MinMaxScaler
5
from sklearn.neural_network import MLPRegressor
6
import pandas as pd
7
import numpy as np#正确新建项⽬并建⽴Python⽂件并导⼊包（题1-2，两个步骤各⼀分）
8
dir = './data/data2.xlsx' #1.正确读取⽂件 1分
9
data = pd.read_excel(dir)#2.正确读取⽂件 1分
10
data.info()  # 3. Read the file correctly (1 point)
y = data['溶解氧'].values
11
x = data.drop(['溶解氧'], axis=1).values
12
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)#4.正确划分数据集 2分
13
ss_x = MinMaxScaler() # 5.数据进⾏归⼀化处理 1分
14
ss_y = MinMaxScaler() # 6.数据进⾏归⼀化处理 1分
15
x_train = ss_x.fit_transform(x_train) #7.数据进⾏归⼀化处理 1分
16
x_test = ss_x.transform(x_test)
17
y_train = ss_y.fit_transform(y_train.reshape(-1, 1))
18
y_test = ss_y.transform(y_test.reshape(-1, 1))#8.数据进⾏归⼀化处理 1分
19
clf = MLPRegressor(max_iter=500)#9.正确初始化多层感知器 5分
20
clf.fit(x_train,y_train.ravel())  # 10. Train the model (3 points); ravel() passes the target as a 1-D array
21
#11.成功进⾏预测
22
predict = clf.predict(x_test)#1分
23
predict = np.array(predict).reshape(len(predict),1)#2分
24
#12.正确评估均⽅误差
25
ytre = ss_y.inverse_transform(y_test)#1分
26
ypre = ss_y.inverse_transform(predict)#1分
27
print('The mean squared error of linear MLP is', mean_squared_error(ytre, ypre))#1分
28
print('The mean absolute error of linear MLP is', mean_absolute_error(ss_y.inverse_transform(y_test), ypre))#2分
29
#13正确绘制曲线图
30
plt.plot(ypre, label='pre') #1分
31
plt.plot(ytre, label='true')#1分
32
plt.legend()# 1分
33
plt.show()# 1分

Class Three#

卷积神经网络模型#

1
import numpy as np
2
from PIL import Image
3
from keras.preprocessing.image import ImageDataGenerator
4
from keras.models import Sequential, Model
5
from keras.layers import Dropout, Flatten, Dense
6
from keras.layers import Conv2D, MaxPooling2D
7
from tensorflow.keras.optimizers import RMSprop, Adam, SGD
8
from keras.callbacks import ModelCheckpoint
9
from keras import applications
10
from keras import optimizers
11
train_data_dir = './data/3.1/train'#1.设置数据路径 1分
12
ClassNames = ['cat','dog'] #2.设置标签名称 1分(可选参数,为⼦⽂件夹的列表)
13
batchsize = 4 #3.设置batchsize⼤⼩ 1分
14
generator = ImageDataGenerator(
15
 rotation_range=10,
16
 width_shift_range=0.1,
17
 height_shift_range=0.1,
18
 zoom_range=0.1,
19
 rescale=1.0/255 #4.将图⽚的像素值归⼀化到0~1 1分? rescale的作⽤是对图⽚的每个像素值均乘上这个放缩因⼦
20
 )
21
train = generator.flow_from_directory(
22
 train_data_dir,
23
 target_size=(224,224),#5.调整图像分辨率 1分
24
 classes=ClassNames,
25
 shuffle=False,
26
 class_mode='binary', # 'categorical',#6.选择标签模式 1分 ?
27
 batch_size=batchsize)
28
#7.模型建⽴
29
model = Sequential()
30
model.add(Conv2D(4,(5,5),activation='relu',padding='same',input_shape=(224, 224, 3))) # 1分
31
model.add(MaxPooling2D(pool_size=(2,2))) # 1分
32
model.add(Conv2D(8,(3,3),padding='same',activation='relu')) # 1分
33
model.add(MaxPooling2D(pool_size=(2, 2))) # 1分
34
model.add(Conv2D(16,(3,3),padding='same',activation='relu')) # 1分
35
model.add(MaxPooling2D(pool_size=(2, 2))) # 1分
36
model.add(Flatten()) # Flatten层⽤来将输⼊“压平”，即把多维的输⼊⼀维化，常⽤在从卷积层到全连接层的过渡。
37
model.add(Dense(128,activation='relu')) # 1分
38
model.add(Dense(1,activation='sigmoid')) # 1分
39
rmsprop = RMSprop(learning_rate=0.0001)#8.设置学习率 2分
40
model.compile(loss='binary_crossentropy',optimizer=rmsprop,metrics=['accuracy'])#9.选择损失函数，优化器 2分
41
#10.编写回调函数，在训练时⽤于保存准确率最⾼模型为weight.h
42
best_model = ModelCheckpoint(filepath='weight.h',monitor='accuracy',verbose=1, save_best_only=True) # 2分
43
model.summary()
44
model.fit_generator(
45
 train,
46
 steps_per_epoch = 400//batchsize,
47
 epochs=15, #11.设置迭代次数 2分
48
 callbacks=[best_model])

VGG模型#

1
from keras.preprocessing.image import ImageDataGenerator
2
from keras.models import Sequential, Model
3
from keras.layers import Dropout, Flatten, Dense
4
from tensorflow.keras.optimizers import RMSprop, Adam, SGD, Adadelta
5
from keras.callbacks import ModelCheckpoint
6
from keras.applications.vgg16 import VGG16, preprocess_input
7

8
train_data_dir = './data/3.2/train'#1.设置数据路径 1分
9
ClassNames = ['crab','shrimp']#2.设置标签名称 1分
10
batchsize = 8 #3.设置batchsize⼤⼩ 1分
11
generator = ImageDataGenerator(
12
 rotation_range=10,
13
 width_shift_range=0.1,
14
 height_shift_range=0.1,
15
 zoom_range=0.1,
16
 rescale=1.0/255 #4.将图⽚的像素值归⼀化到0~1 1分
17
 )
18
train = generator.flow_from_directory(
19
 train_data_dir,
20
 target_size=(224,224),#5.调整图像分辨率 1分
21
 classes=ClassNames,
22
 shuffle=False,
23
 class_mode='binary',#'categorical',#6.选择标签模式 1分
24
 batch_size=batchsize)
25

26
#7.模型建⽴ 参数：模型权重使⽤imagenet的，不包括顶层，输⼊图像尺⼨
27
vgg16 = VGG16(weights='imagenet',include_top=False,input_shape=(224, 224, 3)) # 1分
28
for layer in vgg16.layers:
29
 layer.trainable = False # 1分
30

31
import tensorflow
32
last = vgg16.output  # (1 point) output of the VGG16 base built without its top layers
x = Flatten()(last)  # (1 point)
x = Dense(256,activation='relu')(x)  # (1 point)
x = Dropout(0.5)(x)
x = Dense(256,activation='relu')(x)  # (1 point)
x = Dropout(0.5)(x)
x = Dense(1,activation='sigmoid')(x)  # (1 point) single sigmoid unit for the binary label
33
model = Model(inputs=vgg16.input,outputs=x) # 1分
34

35
best_model = ModelCheckpoint(filepath='weight.h',monitor='accuracy',verbose=1, save_best_only=True) # 8.保存准确率最⾼的模型 2分
36
adadelta = Adadelta()#9.建⽴adadelta模型优化器 2分
37
model.compile(loss='binary_crossentropy',optimizer=adadelta,metrics=['accuracy'])#10.选择损失函数，优化器 2分
38
model.summary()
39
model.fit_generator(
40
 train,
41
 steps_per_epoch=600//batchsize,
42
 epochs=2,#11.设置迭代次数 2分
43
 callbacks=[best_model])

LSTM模型#

1
import math
2
import keras as K
3
from pandas import DataFrame
4
from pandas import concat
5
from numpy import concatenate
6
from pandas import read_csv
7
import numpy as np
8
from sklearn.preprocessing import MinMaxScaler
9
from sklearn.preprocessing import LabelEncoder
10
import matplotlib.pyplot as pyplot
11
from keras.models import Sequential
12
from keras.layers import Dense
13
from keras.layers import LSTM
14
import sklearn.metrics
15
from sklearn.metrics import mean_squared_error
16
from sklearn.metrics import median_absolute_error
17
from keras.layers import Dropout
18
dataset = read_csv('./data/3.3/data.csv')#1.设置数据读取参数 2分
19
dataset.info()
20
values = dataset.values
21
values = values.astype('float32')# 2.将数据转化为float格式 2分
22
# 归⼀化
23
scaler = MinMaxScaler(feature_range=(0, 1))#3.建⽴归⼀化函数 2分 或不填，默认就是这个
24
scaled = scaler.fit_transform(values)#4.归⼀化处理 2分
25
# 框架作为监督学习
26
value2 = scaled
27
n_train_hours = int(len(dataset) * 0.8)  # rows before this index form the training set
k = int(len(dataset) * 0.9)  # rows from n_train_hours up to k form the test set; the rest is the check set
28
train = value2[:n_train_hours, :]
29
test = value2[n_train_hours:k, :]
30
cheak = value2[k:, :]
31

32
# 建⽴输⼊和输出集
33
train_X, train_y = train[:, :-1], train[:, -1]#5.建⽴训练集输⼊输出集 2分
34
test_X, test_y = test[:, :-1], test[:, -1] #6.建⽴测试集集输⼊输出集 2分
35
cheak_X, cheak_y = cheak[:, :-1], cheak[:, -1]
36
# 将输⼊集转化为3维
37
train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
38
test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))
39
cheak_X = cheak_X.reshape((cheak_X.shape[0], 1, cheak_X.shape[1]))
40

41
# 7.构建神经⽹络模型
42
model = Sequential()
43
model.add(LSTM(50,input_shape=(train_X.shape[1], train_X.shape[2]))) # 1分
44
model.add(Dense(1)) # 1分
45
model.compile(loss='mae',optimizer='adam') # 1分
46
# 调整神经⽹络
47
history = model.fit(
48
 train_X, train_y,
49
 epochs=10, # 8.设置最⼤迭代次数 1分
50
 batch_size=50, # 9.设置batchsize⼤⼩ 1分
51
 validation_data=(test_X, test_y), verbose=2, shuffle=False)
52
pyplot.plot(history.history['loss'],label='loss')  # 10. Plot the loss curve (1 point)
53
pyplot.legend()
54
pyplot.show()
55

56
# 进⾏预测
57
ycheak = model.predict(cheak_X)#11.对测试集进⾏预测 2分
58
cheak_X = cheak_X.reshape((cheak_X.shape[0], cheak_X.shape[2])) #
59

60
inv_cheak = concatenate((cheak_X[:, 0:],ycheak), axis=1) # 与预测值合并，按列⽅向
61
inv_cheak = scaler.inverse_transform(inv_cheak)
62
inv_cheak = inv_cheak[:, -1]  # keep the last column, where the predictions were appended by concatenate
63
cheak_y = cheak_y.reshape((len(cheak_y), 1))
64

65
inv_c = concatenate((cheak_X[:, 0:],cheak_y), axis=1) # 与真实值合并，按列⽅向
66
inv_c = scaler.inverse_transform(inv_c)
67
inv_c = inv_c[:, -1]  # keep the last column, where the true values were appended by concatenate
68
pyplot.plot(inv_cheak)#12.绘制预测曲线 2分
69
pyplot.plot(inv_c)#13.绘制实际曲线 2分
70
pyplot.show()