Numpy
练习
参考官方网址:https://numpy.org/doc/stable/user/index.html
初步例⼦
numpy 是数据科学中的基础库包,经常和其他库包一起使用。
import numpy as np
import matplotlib.pyplot as plt

# Plot sin(x): sample [-pi, pi) with a step of 0.01.
x = np.arange(-np.pi, np.pi, 0.01)
y = np.sin(x)
plt.plot(x, y)
plt.show()
import numpy as np
import matplotlib.pyplot as plt

# Draw 1000 floats from the standard normal distribution (mean 0, variance 1).
x = np.random.randn(1000)
# Print the sample mean and the sample standard deviation (np.std, not the variance).
print('mean=', np.mean(x), "std=", np.std(x))
plt.hist(x, bins=50)  # histogram with 50 bins
plt.show()
导⼊ numpy 模块
import numpy as np # 导⼊ numpy 包,起 np 的别名
善于使⽤帮助⾮常重要!!
- 输入 np. 后按 tab 键,可显示所有的属性和方法;或在 . 后多敲几个字母再按 tab 键,可以缩小候选范围
- 查看属性和⽅法的具体使⽤可以⽤help函数,例如:
help(np.array)
# help(np)  # uncomment to show the help for the whole package
help(np.array)  # show the documentation of np.array in the cell output
通过 Python 数据集构造 NumPy 的 ndarray 数组对象
import numpy as np

# Build a 1-D array, forcing the element type to float.
a1 = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=float)
print(type(a1))  # numpy.ndarray
print('a1=', a1, '\n')

# Build a 2-D array; without dtype NumPy picks the smallest type that holds the data.
a2 = np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])
print('a2=', a2)
特殊数组构造
import numpy as np

# 30 evenly spaced numbers over the closed interval [1, 10].
al = np.linspace(1, 10, 30)
print(al)

# Vector of ten integer zeros.
a0 = np.zeros(10, dtype=int)
print('a0=', a0, '\n')

# 3x3 matrix of zeros; the dtype defaults to float.
a0 = np.zeros((3, 3))
print(a0, '\n')

# Start at 0, stop before 10, step 2 (the step defaults to 1).
a1 = np.arange(0, 10, 2)
print('a1=', a1, '\n')

# 3x5 array of ones.
a2 = np.ones((3, 5), dtype=float)
print('a2=', a2, '\n')

# 3x5 array filled with 3.14.
a3 = np.full((3, 5), 3.14)
print("a3=", a3, '\n')

# 5x5 array of uniform random floats.
a4 = np.random.random([5, 5])
print("a4=", a4, '\n')

# 3x3 array of random integers in [0, 10).
a5 = np.random.randint(0, 10, (3, 3))
print("a5=", a5, '\n')

# 3x3 identity matrix.
a6 = np.eye(3)
print("a6=", a6, '\n')
随机数产⽣
import numpy as np

# Reference: https://blog.csdn.net/u012149181/article/details/78913167
np.random.rand(3, 3)  # 3x3 matrix, every value uniform in [0, 1)
np.random.randint(1, 5, [3, 3])  # 3x3 integers in [1, 5): the upper bound 5 is excluded
np.random.randn(3, 3)  # 3x3 matrix drawn from the standard normal distribution

# 5x5 random matrix with values in [0, 1).
a4 = np.random.random([5, 5])
print("a4=", a4, '\n')
print(a4.ndim)   # number of dimensions
print(a4.shape)  # size along each dimension
数组维度改变
import numpy as np

# Change the row/column layout.
a1 = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=float)
a2 = a1.reshape(2, 5)
print(a2, '\n')

# Flatten back to one dimension (a2.ravel() is an alternative).
a3 = a2.flatten()
print(a3, '\n')

# Inspect the shapes, then reshape a3 in place by assigning to .shape.
print(a2.shape)
print(a3.shape)
a3.shape = (5, 2)
print(a3.shape)
# A 2x3x4 array ...
array1 = np.array([
    [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]],
    [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]],
])
print(array1.shape)
# ... viewed as 6 rows of 4 columns.
array1.shape = (6, 4)
print(array1)
# Transpose
a1 = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=float)
a2 = a1.reshape(2, 5)
print(a2)
a3 = a2.T
print(a3)
基本运算
# Basic statistics of the array a3 defined in the previous cell.
print(a3.max())   # largest element
print(a3.mean())  # mean of all elements
# Signature: ndarray.mean(axis=None, dtype=None, out=None, keepdims=False, *, where=True)
import numpy as np

# Whole-array arithmetic: min-max scale every element at once.
a2 = np.arange(10).reshape(2, 5)
# Named a_max/a_min so the builtins max()/min() are not shadowed.
a_max = a2.max()
a_min = a2.min()
a2 = (a2 - a_min) / (a_max - a_min)
print(a2)

X = np.random.randint(0, 10, [3, 3])
print(X)
# Subtract each row's mean from that row; keepdims=True keeps shape (3, 1) so it broadcasts.
# Signature: ndarray.mean(axis=None, dtype=None, out=None, keepdims=False, *, where=True)
Y = X - X.mean(axis=1, keepdims=True)
print(Y)
# Generate random numbers.
xlist = np.random.randint(10, size=(10))  # ten integers in [0, 10)
ylist = np.random.rand(10)                # ten floats in [0, 1)
ylist  # a bare expression: a notebook cell displays its value
# Universal functions (ufunc): unary examples.
import numpy as np

a = np.array([1, -1, 2, -2, 3])
a1 = np.abs(a)     # element-wise absolute value
print(a1)
a2 = np.square(a)  # element-wise square
print(a2)
# Universal functions (ufunc): binary examples.
a = np.array([1, -1, 2, -2, 3])
b = np.arange(1, 6)
c = np.add(a, b)       # element-wise a + b
print(c)
d = np.subtract(a, b)  # element-wise a - b
print(d)               # the difference was computed but never displayed before
矩阵运算
# More advanced operations.
# Stacking arrays
help(np.hstack)
help(np.vstack)
# Splitting arrays
help(np.hsplit)
help(np.vsplit)
help(np.split)
# np.mat was removed in NumPy 2.0; np.asmatrix is the same function under its surviving name.
help(np.asmatrix)
# 2x3 matrix; np.asmatrix replaces np.mat, which NumPy 2.0 removed.
A = np.asmatrix("1 2 3;4 5 6")
print(A)
# 3x2 matrix
B = np.array([[1, 2], [3, 4], [5, 6]])
print(B)
# 3x3 matrix built from a nested list
a = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
C = np.array(a)
print(C)

# Matrix product (2x3) . (3x2) -> 2x2
D = A.dot(B)
print(D)
a = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
arr3 = np.array(a)
tt = np.tril(arr3)  # lower triangle
print(tt)
tt = np.triu(arr3)  # upper triangle
print(tt)
# np.mgrid[start:end:step] builds an arithmetic sequence over the half-open range [start, end).
import numpy as np

a = np.mgrid[1:4:1]
# np.mgrid[0:5, 0:5]  # 2-D case: one grid increases along rows, the other along columns, stacked together
print(a)
# help(np.mgrid)
任务题
1.安装 Numpy 工具包
sudo apt install python3-numpy 或 sudo pip3 install numpy
2.创建一个长度为10的一维全为0的ndarray对象,然后让第5个元素等于1
import numpy as np

# Ten zeros, then set the fifth element (index 4) to 1.
array = np.zeros(10)
array[4] = 1
print(array)
3.创建一个元素为从10到49的ndarray对象,并将所有元素位置反转
import numpy as np

# Values 10..49 inclusive, then reverse the element order.
array = np.arange(10, 49 + 1)
array = np.flipud(array)
print(array)
4.使用np.random.random创建一个10*10的ndarray对象,并打印出最大最小元素
import numpy as np

# 10x10 array of uniform random floats; report its extremes.
array = np.random.random((10, 10))

print("max:{}".format(array.max()))
print("min:{}".format(array.min()))
5.正则化一个5*5随机矩阵.(正则的概念:假设a是矩阵中的一个元素,max/min分别是矩阵元素的最大最小值,则正则化后a = (a-min)/(max-min) )
import numpy as np

array = np.random.random((5, 5))
print("origin:")
print(array)
print(20 * "-")

# Min-max scaling: a = (a - min) / (max - min) maps the values onto [0, 1].
array_min = array.min()
array_max = array.max()
array = (array - array_min) / (array_max - array_min)
print("正则化:")
print(array)
Pandas
安装 Pandas
可以使用如下指令安装 pandas:
pip3 install pandas
如果安装超时或失败,可以尝试使用清华源:
pip3 install pip -U
pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple/
练习
导入Pandas库包
官⽅参考⽹站:
https://pandas.pydata.org/pandas-docs/stable/user_guide/index.html#
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
Pandas 中有两个重要的数据结构: series 和 DataFrame 。在使⽤ Pandas 之前,需要理解这两种数据结构的相关知识。
# Import pandas under the alias pd.
import pandas as pd
# Import numpy under the alias np.
import numpy as np
任务题
读取 goods.csv 文件中的行数和列数,并输出前10行的内容
import pandas as pd
import numpy as np

df = pd.read_csv("./goods.csv", sep=',')  # ',' is already the default separator
print(df.shape)     # (number of rows, number of columns), as the task asks
print(df.head(10))  # first 10 rows
打印全部列名,打印数据集的索引
import pandas as pd
import numpy as np

df = pd.read_csv("./goods.csv")
print(df.columns)  # all column names
print(df.index)    # the row index of the data set
找出 goods 数据集 item_name 列中,一共有多少种商品被下单,被下单数最多的商品(item)是什么?
import pandas as pd
import numpy as np

df = pd.read_csv("./goods.csv")

# Number of distinct items that were ordered.
print(pd.unique(df['item_name']).size)

# Item that appears on the most order lines.
# NOTE(review): this counts rows, not units; if "most ordered" should weigh the
# quantity column, use df.groupby('item_name')['quantity'].sum() instead - confirm.
print(df['item_name'].value_counts().head(1))
在 choice_description 中,销售量最多的商品是什么?
import pandas as pd
import numpy as np

df = pd.read_csv("./goods.csv")
# Most frequent value in the choice_description column.
print(df['choice_description'].value_counts().head(1))
将 item_price 转换为浮点型,并计算总收入
import pandas as pd
import numpy as np

df = pd.read_csv("./goods.csv")

# Convert prices such as "$2.39 " to float. Stripping whitespace and the leading
# '$' explicitly avoids cutting off the last digit when there is no trailing space.
df['item_price'] = df['item_price'].str.strip().str.lstrip('$').astype(float)

# Revenue per order line, rounded to cents (rounding to whole units lost the cents).
df['sub_total'] = (df['item_price'] * df['quantity']).round(2)
print(df['sub_total'].sum())
测试题
1.1 泰坦尼克乘客数据.csv
# 1. Import pandas as pd
import pandas as pd

# 2. Read the csv file into the variable titantic
titantic = pd.read_csv('./泰坦尼克乘客数据.csv')
# 3. Show the first 5 rows
print(titantic.head(5))
# 4. Show the last 4 rows
print(titantic.tail(4))
# 5. Summary of the data set
print(titantic.info())
# 6. Use the 'pclass', 'age' and 'sex' columns as X
X = titantic[['pclass', 'age', 'sex']]
# 7. Use the 'survived' column as Y
Y = titantic['survived']
# 8. Replace missing 'age' values with the column mean.
#    X is a slice of titantic, so a chained X['age'].fillna(..., inplace=True)
#    raises SettingWithCopyWarning and may leave X unchanged; rebuild X instead.
X = X.fillna({'age': X['age'].mean()})
# 9. Summary of X after the replacement
print(X.info())
1.2 breast-cancer-wisconsin.data
# 1. Import numpy as np
import numpy as np
# 2. Import pandas as pd
import pandas as pd

# 3. Column names for the file, which has no header row
column_names = [
    'Sample code number', 'Clump Thickness', 'Uniformity of Cell Size',
    'Uniformity of Cell Shape', 'Marginal Adhesion',
    'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
    'Normal Nucleoli', 'Mitoses', 'Class',
]
# 4. Read the data with those column names into the variable data.
# Source: https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data
# (or read the local copy of 'breast-cancer-wisconsin.data')
data = pd.read_csv("./breast-cancer-wisconsin.data", header=None, names=column_names)
print(data)
# 5. First 4 rows
print(data.head(4))
# 6. Summary of the data set
print(data.info())
# 7. Replace every '?' with np.nan
data = data.replace(to_replace='?', value=np.nan)
# 8. Drop every row that contains at least one NaN
data = data.dropna(how='any', axis=0)
# 9. Summary after dropping the missing values
print(data.info())
# 10. Shape (rows, columns) of the data set
print(data.shape)
2.1 水质参数.xlsx
# 1. Import pandas as pd
import pandas as pd

# 2. Read the Excel file into data.
# If this raises "ImportError: Missing optional dependency 'openpyxl'",
# install it with pip or conda (pip3 install openpyxl).
data = pd.read_excel("水质参数.xlsx")
# 3. Print the summary statistics
print(data.describe())
# 4. First 4 rows
print(data.head(4))
# 5. Take the '溶解氧(DO)' column as a Series and store its values in y
y = pd.Series(data['溶解氧(DO)']).values
# 6. Drop that column and store the remaining frame in x
x = data.drop(['溶解氧(DO)'], axis=1)
# 7. Show the values of x
print(x.values)
2.3 data2.xlsx
# 1. Import numpy as np and pandas as pd
import numpy as np
import pandas as pd

# 2. Read the Excel file 'data2.xlsx' into data
data = pd.read_excel("./data2.xlsx")
# 3. Summary statistics
print(data.describe())
# 4. First 5 rows
print(data.head(5))
# 5. Take the '溶解氧' column as a Series and store its values in y
y = pd.Series(data['溶解氧']).values
# 6. Drop that column and store the VALUES of the remaining frame in x,
#    as the task requires (the frame itself used to be stored).
x = data.drop(['溶解氧'], axis=1).values
# 7. Show x
print(x)
3.3 data.csv
# 1. Import numpy as np and pandas as pd
import numpy as np
import pandas as pd

# 2. Read 'data.csv': the first row is the header, the first column is the index
dataset = pd.read_csv("./data.csv", index_col=0)
# 3. Summary statistics
print(dataset.describe())
# 4. Raw values of the data set
values = dataset.values
# 5. Make sure every value is a float
values = values.astype(float)
# 6. Show the values
print(values)
Matplotlib
安装 Matplotlib
Ubuntu 下安装 Matplotlib 有两种方法:
pip3 install matplotlib 或 sudo apt install python3-matplotlib
如果官方源下不动,将 python 源改为清华源。
任务题
给定横坐标为x=[1,2,3,4],纵坐标为a=[1,4,5,9] ,b = [2,8,6,3],将其绘制在同一张图中标签分别为 LineA 和 LineB,其中LineA 为红色折线图,LineB 为蓝色点线图,并设置图名为 Graph1。
import matplotlib.pyplot as plt

x = [1, 2, 3, 4]
a = [1, 4, 5, 9]
b = [2, 8, 6, 3]
# LineA: solid red line; LineB: dotted blue line. The labels feed the legend.
plt.plot(x, a, 'r', label="LineA")
plt.plot(x, b, 'b', linestyle=':', label="LineB")
plt.legend()  # without this the required labels never appear
plt.title("Graph1")
plt.show()
2.使用 Numpy 和 Matplotlib 绘制 1-11 范围内的函数曲线 y=x*x+1,并设置图名为 Graph2,横轴和纵轴名分别为 x 和 y 。
import matplotlib.pyplot as plt
import numpy as np

# np.arange excludes its stop value, so stop at 12 to cover 1..11 inclusive.
x = np.arange(1, 12)
y = x * x + 1
plt.plot(x, y)
plt.title("Graph2")
plt.xlabel("x")  # axis names required by the task
plt.ylabel("y")
plt.show()
3.使用 matplotlib 绘制 (1,2,3,4,5) 及其平方的散点图。设置点的大小为40,颜色为红色,并删除数据点轮廓;横轴和纵轴名分别为 x 和 y (字体大小设为14),并设置图名为 Graph3(字体大小设为24)。
import matplotlib.pyplot as plt
import numpy as np

x = np.arange(1, 6)
y = x * x
# edgecolors='none' removes the marker outline, as the task requires.
plt.scatter(x, y, s=40, c='red', edgecolors='none')
plt.xlabel("x", fontsize=14)
plt.ylabel("y", fontsize=14)
plt.title("Graph3", fontsize=24)
plt.show()
4.给定两对数据y1=[10,25,30],x1=[1,3,5],y2=[50,15,80],x2=[2,4,6]。其中 x1, x2 为横坐标 y1, y2 为纵坐标。分别绘制条形图其中 x1 的条形图为绿色,x2 的条形图为红色,横轴和纵轴名分别为 x 和 y ,并设置图名为 Graph4。
import matplotlib.pyplot as plt

y1 = [10, 25, 30]
x1 = [1, 3, 5]
y2 = [50, 15, 80]
x2 = [2, 4, 6]
plt.bar(x1, y1, color="green")  # first series in green
plt.bar(x2, y2, color="red")    # second series in red
plt.xlabel("x")  # axis names required by the task
plt.ylabel("y")
plt.title("Graph4")
plt.show()
5.给定一组数 [22,87,5,43,56,73,55,54,11,20,51,5,79,31,27] 分布于 0-100,请使用 hist 函数以 20 为间隔绘制频数统计图,并设置图名为 Graph5 。
import matplotlib.pyplot as plt
import numpy as np

y = [22, 87, 5, 43, 56, 73, 55, 54, 11, 20, 51, 5, 79, 31, 27]
# Bin edges 0, 20, ..., 100 give a bin width of 20.
plt.hist(y, np.arange(0, 101, 20))
plt.title("Graph5")  # title required by the task
plt.show()
6.某班级同学参加三类活动A,B,C分别有15,20,35人画出各类活动的饼状图。图像为正视图,各类活动显示百分比(精确到小数点后两位),并设置图名为 Graph6 。
import matplotlib.pyplot as plt

labels = ['A', 'B', 'C']
sizes = [15, 20, 35]
explode = (0, 0, 0)  # no wedge is pulled out
# '%1.2f%%' prints percentages with two decimals, as the task requires.
plt.pie(sizes, explode=explode, labels=labels, autopct='%1.2f%%')
plt.axis('equal')  # equal aspect ratio: the pie is drawn as a circle
plt.title("Graph6")
plt.show()
7.利用数据 x = [1,2,3,4,5], y = [1,2,4,3,5],画出如下图形,其中,第1个 axes 标题大小为 12 ,总图形标题大小为 20 。
import matplotlib.pyplot as plt

x = [1, 2, 3, 4, 5]
y = [1, 2, 4, 3, 5]

# One row of three axes: line plot, scatter plot, horizontal bar chart.
fig, axes = plt.subplots(1, 3, figsize=(14, 4))
ax1, ax2, ax3 = axes[0], axes[1], axes[2]
ax1.plot(x, y)
ax2.scatter(x, y)
ax3.barh(x, y)
ax1.set_title("ax1 title", fontsize=12)
fig.suptitle("figure title <subplots test>", fontsize=20)
plt.show()
8.读取data目录下的 iris.csv
文件中的数据,画出不同种类(species)鸢尾花萼片和花瓣的大小关系(分类散点子图)。
# 0. Imports
import pandas as pd
import matplotlib.pyplot as plt

# 1. Read iris.csv with pandas
df = pd.read_csv('./iris.csv')
# 2. First 5 rows
print(df.head())
# 3. Summary of the data
df.info()
# 4. Sizes as length * width
df['sepal_size'] = df['sepal_length'] * df['sepal_width']
df['petal_size'] = df['petal_length'] * df['petal_width']
# 5. All species names
species = df['species'].unique()
# 6-7. One scatter series per species (the first three, one colour each)
fig, ax2_2 = plt.subplots()
for name, colour in zip(species, ('#ff0000', '#00ff00', '#0000ff')):
    subset = df[df['species'] == name]
    ax2_2.scatter(subset['sepal_size'], subset['petal_size'], color=colour, label=name)
# 8. Legend
ax2_2.legend(loc='best')
# 9. Title and axis labels
ax2_2.set_title('Size of Sepal vs Size of Petal')
ax2_2.set_xlabel('size of sepal')
ax2_2.set_ylabel('size of petal')
# 10. Show the figure
plt.show()
SKlearn
安装SKlearn
Ubuntu 下安装 SKlearn 有两种方法:
pip3 install scikit-learn 或 sudo apt install python3-sklearn
如果官方源下不动,可以查看人工智能 Pandas,将python源改为清华源。
任务题
KMeans
# Created On 2021.10. By 摩羯(whitelot@163.com)# https://zodiaclab.top/# 使用PyCharm编译器创建Python程序,搭建K-Means算法处理实现鸢尾花数据的聚类问题。使用K-Means算法对鸢尾花的数据进行聚类并将聚类结果进行输出。
from sklearn import datasets
from sklearn.cluster import KMeans
import pandas as pd
import matplotlib.pyplot as plt

# Load the iris data set that ships with scikit-learn.
iris = datasets.load_iris()
iris_data = iris.data
iris_target = iris.target

# One (colour, marker) pair per class / cluster id 0, 1, 2.
styles = [("r", "o"), ("g", "*"), ("b", "+")]

# Ground truth: petal length vs. petal width, one series per species.
for label, (colour, marker) in enumerate(styles):
    rows = iris_target == label  # boolean mask selecting this species
    plt.scatter(iris_data[rows, -2], iris_data[rows, -1], c=colour, marker=marker)
plt.legend(["setosa", "versicolor", "virginica"])
plt.title("Real", fontsize=20)
plt.xlabel("petal length (cm)", fontsize=12)
plt.ylabel("petal width (cm)", fontsize=12)
plt.show()

# Optional: export the data set to ./iris.csv with species names.
# df_iris = pd.DataFrame(iris.data, columns=iris.feature_names)
# df_iris['species'] = [iris.target_names[t] for t in iris.target]
# df_iris.to_csv('./iris.csv', index=None)
# df_iris.info()
# plt.scatter(df_iris["petal length (cm)"], df_iris["petal width (cm)"])
# plt.xlabel("petal length (cm)", fontsize=12)
# plt.ylabel("petal width (cm)", fontsize=12)
# plt.show()

X = iris.data[:, 2:]  # keep only the last two features (petal length, petal width)

estimator = KMeans(n_clusters=3)  # build the clusterer
estimator.fit(X)                  # cluster
label_pred = estimator.labels_

# Clustering result, one series per cluster id.
for label, (colour, marker) in enumerate(styles):
    rows = label_pred == label
    plt.scatter(X[rows, 0], X[rows, 1], c=colour, marker=marker)
# KMeans numbers its clusters arbitrarily, so cluster 0 is not necessarily
# setosa; label the series by cluster id instead of by species name.
plt.legend(["cluster 0", "cluster 1", "cluster 2"])
plt.title("KMeans", fontsize=20)
plt.xlabel("petal length (cm)", fontsize=12)
plt.ylabel("petal width (cm)", fontsize=12)
plt.show()
逻辑回归
# Created On 2021.10. By 摩羯(whitelot@163.com)# https://zodiaclab.top/# 使用PyCharm编译器创建Python程序,搭建并训练逻辑回归分类器处理鸢尾花分类问题。使用已训练的分类器对测试集中的鸢尾花数据进行分类并对分类结果进行多性能指标评估。
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report

iris = datasets.load_iris()
X = iris.data
Y = iris.target

# Preview the data
print("样本数据:")
print(X[:10])
print("标签数据:")
print(Y[:10])

# Split 8:2 into training and test sets
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, shuffle=True)
print("Length of train {}, Length of test {}".format(len(x_train), len(x_test)))

# Min-max scaling: fit on the training set only, then apply to the test set
MinMax_x = MinMaxScaler()
x_train = MinMax_x.fit_transform(x_train)
x_test = MinMax_x.transform(x_test)

# Build the model. multi_class='multinomial' is omitted: the argument is
# deprecated in recent scikit-learn, and with the newton-cg solver a
# multi-class problem is fitted as multinomial by default anyway.
lr = LogisticRegression(penalty='l2', solver='newton-cg')

# Train
model = lr.fit(x_train, y_train)

# Evaluate with the estimator's score method (accuracy)
accuracy = lr.score(x_test, y_test)
print('准确度为:%.3f' % accuracy)

# Predict and report per-class metrics
y_pre = model.predict(x_test)
target_names = ["setosa", "versicolor", "virginica"]
print(classification_report(y_test, y_pre, target_names=target_names))
线性分类器
# Created On 2021.10. By 摩羯(whitelot@163.com)# https://zodiaclab.top/# 使用PyCharm编译器创建Python程序,搭建并训练线性分类器处理良恶性乳腺癌肿瘤预测问题。使用已训练分类器对测试集中的肿瘤类别进行预测并对预测结果进行多性能指标评估。
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report

# Load the data and show its summary. The file has no header, so the 11
# columns are simply numbered 0..10.
column_names = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
df = pd.read_csv("./data/breast-cancer-wisconsin.data", header=None, names=column_names)
print(df.info())

# Mark '?' as missing and drop incomplete rows
df = df.replace(to_replace='?', value=np.nan)
df = df.dropna()
print(df.info())

# Split the data. Column 0 is the sample code number (an identifier, not a
# measurement), so the features are columns 1..9 and the label is column 10.
x_train, x_test, y_train, y_test = train_test_split(
    df[column_names[1:10]], df[column_names[10]], test_size=0.25, shuffle=True)
print("训练集统计特性:")
print(x_train.info())
print("测试集统计特性:")
print(x_test.info())

# Standardise the features: fit on the training set, reuse on the test set
std_x = StandardScaler()
x_train = std_x.fit_transform(x_train)
x_test = std_x.transform(x_test)

# Initialise both linear classifiers
lr = LogisticRegression()
sgdc = SGDClassifier()

# Train and predict with logistic regression
lr.fit(x_train, y_train)
lr_y_predict = lr.predict(x_test)

# Train and predict with the SGD classifier
sgdc.fit(x_train, y_train)
sgdc_y_predict = sgdc.predict(x_test)

# Accuracy scores (the first label used to read '确度为', missing a character)
lr_accuracy = lr.score(x_test, y_test)
print('LogisticRegression 准确度为:%.3f' % lr_accuracy)
sgdc_accuracy = sgdc.score(x_test, y_test)
print('SGDClassifier 准确度为:%.3f' % sgdc_accuracy)

# Per-class metrics
print("LogisticRegression:")
print(classification_report(y_test, lr_y_predict))
print("SGDClassifier:")
print(classification_report(y_test, sgdc_y_predict))
集成学习
# Created On 2021.10. By 摩羯(whitelot@163.com)# https://zodiaclab.top/# 使用PyCharm编译器创建Python程序,搭建并使用随机森林处理泰坦尼克号乘客生存预测问题。导入数据和必要包,选取特征,补充缺失值并对预测结果进行多性能指标评估。
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load the data and show its summary
titanic = pd.read_csv("./data/titanic.csv")
print(titanic.info())

# Select the features and fill missing ages with the mean age.
# df is a slice of titanic, so a chained df['age'].fillna(..., inplace=True)
# raises SettingWithCopyWarning and may not modify df; rebuild df instead.
df = titanic[['pclass', 'age', 'sex', 'survived']]
df = df.fillna({'age': df['age'].mean()})
print(df.info())

# Split the data
x_train, x_test, y_train, y_test = train_test_split(
    df[['pclass', 'age', 'sex']], df['survived'], test_size=0.25, shuffle=True)
print("训练集统计特性:")
print(x_train.info())
print("测试集统计特性:")
print(x_test.info())

# Encode each row as a feature dict, then vectorise the dicts
vec = DictVectorizer()
x_train = vec.fit_transform(x_train.to_dict(orient='records'))
x_test = vec.transform(x_test.to_dict(orient='records'))
print(vec.feature_names_)

# Single decision tree
dtc = DecisionTreeClassifier()
dtc.fit(x_train, y_train)

accuracy = dtc.score(x_test, y_test)
print('准确度为:%.3f' % accuracy)

dt_predict = dtc.predict(x_test)
print(classification_report(y_test, dt_predict))

# Random forest
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)

rfc_accuracy = rfc.score(x_test, y_test)
print('准确度为:%.3f' % rfc_accuracy)

rfc_predict = rfc.predict(x_test)
print(classification_report(y_test, rfc_predict))
多项式朴素贝叶斯
这道题上可能有点问题,题目问题描述的比较模糊,就当参考看一下就行。
# Created On 2021.10. By 摩羯(whitelot@163.com)# https://zodiaclab.top/# 使用PyCharm编译器创建Python程序,搭建并创建贝叶斯模型实现影评观众情绪分类问题。将数据集按7:3的比例划分为训练集和测试集,使用朴素贝叶斯分类器对训练集进行训练,并使用测试集测试影评观众情绪分类的准确性。
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import naive_bayes
from sklearn import metrics
import numpy as np

# quoting=3 is csv.QUOTE_NONE: quote characters in the reviews are kept as text.
df = pd.read_csv("./data/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

# Structure of the data
print(df.info())

# Split 7:3 as the task requires (test_size used to be 0.2)
x_train, x_test, y_train, y_test = train_test_split(
    df['review'], df['sentiment'], test_size=0.3, shuffle=True)

# Bag-of-words features: learn the vocabulary on the training set only
vectorizer = CountVectorizer()
features = vectorizer.fit_transform(x_train)
test_features = vectorizer.transform(x_test)

# Vocabulary
# print(vectorizer.get_feature_names_out())

# Multinomial naive Bayes
model = naive_bayes.MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)
model.fit(features, y_train)

# Probability of each class for every test review
probability = model.predict_proba(test_features)
print(probability)

# Accuracy
y_predict = model.predict(test_features)
accuracy = metrics.accuracy_score(y_test, y_predict)
print('准确度为:%.3f' % accuracy)
Keras
介绍
Keras是一个高层神经网络API,由纯Python编写而成,并以Tensorflow、Theano以及CNTK为后端。
需要注意的是,在TensorFlow2.x中Keras已经成为其高级API。
所以你的项目无论是用Keras还是Tensorflow都差不了多少,总体上Keras会比Tensorflow简单一点。
任务题
Keras基础
定义一个多类别分类(multi-class classification)的多层感知器(MLP)模型。
该模型有784个输入,3个隐藏层,分别为512,216和128个隐藏神经元,输出层有10个输出。
在每个隐藏层中使用relu激活函数,并且在输出层中使用softmax激活函数进行多类别分类。
from keras.models import Sequential
from keras.layers import Dense, Input

n_input = 784     # input features
n_hidden_1 = 512  # units of the three hidden layers
n_hidden_2 = 216
n_hidden_3 = 128
n_classes = 10    # output classes

model = Sequential()

# An explicit Input object is the recommended way to declare the input shape.
model.add(Input(shape=(n_input,)))
model.add(Dense(n_hidden_1, activation="relu"))
model.add(Dense(n_hidden_2, activation="relu"))
model.add(Dense(n_hidden_3, activation="relu"))
model.add(Dense(n_classes, activation="softmax"))  # class probabilities
定义一个用于图像分类的卷积神经网络(Convolutional neural network)。
该模型接收灰阶的28 * 28图像作为输入,然后有一个作为特征提取器的两个卷积层(卷积层)和池化层(选最大池化,池化窗口大小为2X2)的序列,即一个卷积层后面一个池化层,然后再一个卷积层后面一个池化层,两个卷积层的filters分别为128和64,激活函数选relu,kernel_size均为4X4,池化层,均选最大池化,池化窗口大小为2X2。
然后是一个完全连接层来解释特征,该全连接层有64个神经元,激活函数选relu,并且具有用于10类预测的softmax激活的输出层。
from tensorflow import keras
from tensorflow.keras import layers

input_shape = (28, 28, 1)  # 28x28 greyscale image
num_classes = 10

model = keras.Sequential([
    keras.Input(shape=input_shape),
    # Feature extractor: two conv + max-pool stages
    layers.Conv2D(128, kernel_size=(4, 4), activation="relu"),
    layers.MaxPooling2D(pool_size=(2, 2)),
    layers.Conv2D(64, kernel_size=(4, 4), activation="relu"),
    layers.MaxPooling2D(pool_size=(2, 2)),
    # Flatten the feature maps; without this the Dense layers act on the last
    # axis only and the model outputs a 4x4 grid of predictions per image.
    layers.Flatten(),
    layers.Dense(64, activation="relu"),
    layers.Dense(num_classes, activation="softmax"),
])
定义一个长短期记忆(LSTM)递归神经网络用于图像分类。
该模型预期一个特征的784个时间步骤作为输入。
该模型具有单个LSTM隐藏层以从序列中提取特征,LSTM隐藏层的神经元数为128,接着是一个全连接层来解释LSTM输出,该全连接层的神经元数为128,激活函数选relu,接着是用于进行10类别预测的输出层,输出层激活函数选softmax。
from keras.models import Sequential
from keras.layers import LSTM, Dense, Input

n_input = 784     # number of time steps
num_classes = 10

model = Sequential()

# The task specifies 784 time steps of ONE feature, i.e. shape (784, 1).
# input_dim=784 declared 784 features per step instead.
model.add(Input(shape=(n_input, 1)))
model.add(LSTM(128))
model.add(Dense(128, activation="relu"))
model.add(Dense(num_classes, activation="softmax"))
BP神经网络
这里关于 Adam 有个版本问题,如果你是在 Windows 平台上请使用 “from keras.optimizers import Adam”。
如果是Mac或Linux可以使用如下代码中的两种Import方法。
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from keras.models import Sequential
from keras.layers import Dense
import matplotlib.pyplot as plt

# The location of Adam depends on the Keras version: try the plain import
# first and fall back to the adam_v2 module used by some releases.
try:
    from keras.optimizers import Adam
except ImportError:
    from keras.optimizers import adam_v2
    Adam = adam_v2.Adam

# Read the Excel data set
data = pd.read_excel("./数据/数据集有氨氮的数据.xlsx")
# Target: the dissolved-oxygen column
y = data["溶解氧"].values
# Features: every other column
x = data.drop("溶解氧", axis=1).values
print(x)
# Standardise the features with sklearn's StandardScaler
scaler = StandardScaler()
x = scaler.fit_transform(x)

# Hold out 20% of the samples as the test set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Three-layer BP network: 6 inputs -> 14 hidden units -> 1 output
model = Sequential()
model.add(Dense(14, activation='sigmoid', input_dim=6))
# NOTE(review): a sigmoid output can only produce values in (0, 1) while y is
# not scaled here - confirm the target range, or scale y / use a linear output.
model.add(Dense(1, activation='sigmoid'))

# Compile: MSE loss, Adam optimiser, MAPE reported as the metric
model.compile(loss='mse', optimizer=Adam(learning_rate=0.01), metrics=['mape'])
# Model summary
model.summary()

# Train: epochs is the number of passes over the training set,
# batch_size the number of samples per gradient update (10 here).
history = model.fit(x_train, y_train, batch_size=10, epochs=200)

# Evaluate. score[1] is the MAPE metric compiled above, not an accuracy.
score = model.evaluate(x_test, y_test, verbose=0)
print("Test MAPE:", score[1])

# Loss curve over the training epochs
loss = history.history['loss']
epochs = range(len(loss))
plt.plot(epochs, loss, label="Train_loss")
plt.legend()
plt.show()

# Predict on the test set
result = model.predict(x_test)
# True values
print(y_test)
# Predicted values
print('测试集的预测结果为:', result)
# Plot predictions against the true values
plt.figure()
plt.plot(y_test, label='true data')
plt.plot(result, 'r:', label='predict')
plt.legend()
plt.show()
损失函数图
预测结果图
人工智能 2021考试题库
由于Linux和Windows的路径不同,执行前需要先修改数据文件的路径。
Class One
决策树模型
# Complete the code according to the requirements above.
import pandas as pd

# 1. Read the data file (3 pts)
titantic = pd.read_csv('./data/泰坦尼克乘客数据.txt')
titantic.head()
titantic.info()
# 2. Keep the key features pclass, age, sex (2 pts)
X = titantic[['pclass', 'age', 'sex']]
Y = titantic['survived']
# 3. Fill the missing ages with the mean age (3 pts).
#    Rebuilding X avoids the chained X['age'].fillna(..., inplace=True) pitfall.
X = X.fillna({'age': X['age'].mean()})
X.info()

# 4. Split the data set (2 pts). random_state fixes the random seed so that
#    repeated runs produce the same split.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=33)
X_train.info()
X_test.info()

# 5. Feature conversion (4 pts); sparse=True returns a sparse matrix
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer(sparse=True)
X_train = vec.fit_transform(X_train.to_dict(orient='records'))
X_test = vec.transform(X_test.to_dict(orient='records'))
print(X_train)

# 6. Initialise the decision tree (3 pts)
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
# 7. Train (3 pts)
dtc.fit(X_train, Y_train)
# 8. Predict and print (4 pts)
y_predict = dtc.predict(X_test)
print(y_predict)
# 9. Evaluate the classification and print (4 pts)
from sklearn.metrics import classification_report
print(dtc.score(X_test, Y_test))
result = classification_report(Y_test, y_predict, target_names=['died', 'survived'])
print(result)
K近邻分类器
import pandas as pd
import numpy as np

# All 11 columns of the file. The list used to stop at 'Bland Chromatin', so
# column_names[10] below raised IndexError.
column_names = [
    'Sample code number', 'Clump Thickness', 'Uniformity of Cell Size',
    'Uniformity of Cell Shape', 'Marginal Adhesion',
    'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
    'Normal Nucleoli', 'Mitoses', 'Class',
]
# 1. Read the file (3 pts)
data = pd.read_csv('./data/breast-cancer-wisconsin.data', names=column_names)
# 2. Summary of the data (2 pts)
data.info()

# 3. Replace '?' with the standard missing-value marker (2 pts)
data = data.replace(to_replace='?', value=np.nan)
# 4. Drop rows with any missing value (3 pts)
data = data.dropna(how='any')
data.info()
data.shape

# 5. Split the data set (2 pts): features are columns 1..9, label is column 10
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    data[column_names[1:10]], data[column_names[10]], test_size=0.2, random_state=33)
X_train.info()
X_test.info()

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# 6. Standardise (2 pts): fit on the training set only, then transform the
#    test set with the SAME statistics (refitting on the test set leaks it).
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)
# 7. Choose the classifier (2 pts)
knc = KNeighborsClassifier()
knc.fit(X_train, Y_train)
# 8. Predict (3 pts)
y_predict = knc.predict(X_test)
print("预测结果:")
print(y_predict)
# 9. Evaluate (2 pts)
from sklearn.metrics import classification_report
print("Accuracy of K-Nearest Neighbor Classifier is", knc.score(X_test, Y_test))
print(classification_report(Y_test, y_predict, target_names=['Benign', 'Malignant']))
朴素贝叶斯分类器
from sklearn.datasets import fetch_20newsgroups

# 1. Load the training subset (1 pt); it is downloaded on first use
news = fetch_20newsgroups(subset='train')
# 2. Print information about the data (1 pt)
print(len(news.data))
print(news.data[0])

# 3. Split the data set (4 pts)
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    news.data, news.target, test_size=0.2, random_state=33)

# 4. Text vectorisation (1 pt)
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
X_train = vec.fit_transform(X_train)  # 5. training set (1 pt)
X_test = vec.transform(X_test)        # 6. test set (1 pt)

# 7. Import naive Bayes (8 pts)
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()             # 8. build the model (4 pts)
mnb.fit(X_train, Y_train)         # 9. train (2 pts)
y_predict = mnb.predict(X_test)   # 10. predict (2 pts)
print(y_predict)

from sklearn.metrics import classification_report
# 11. Accuracy (1 pt): score must be CALLED; printing mnb.score only shows
#     the bound method object.
print("The accuracy of Naive Bayes Classification is", mnb.score(X_test, Y_test))
# 12. Classification report (1 pt)
print(classification_report(Y_test, y_predict, target_names=news.target_names))
Class Two
线性回归模型
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pandas as pd
import matplotlib.pyplot as plt

# 1. Load the data set (2 pts). The file name uses the ordinary character
#    '水'; the extracted text carried a look-alike radical code point that
#    would not match the file on disk.
data = pd.read_excel('./data/水质参数.xlsx')
# 2. Summary of the data (2 pts)
print(data.info())
data.head()
# 3. Separate the inputs from the label (4 pts)
y = data['溶解氧(DO)'].values
x = data.drop(columns=['溶解氧(DO)'])
# 4. Split the data (3 pts)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
# 5. Standardise (4 pts): fit on the training set, reuse on the test set
std_x = StandardScaler()
x_train = std_x.fit_transform(x_train)
x_test = std_x.transform(x_test)
# 6. Build and train the model (4 pts)
lr = LinearRegression()
lr.fit(x_train, y_train)
# 7. Predict (2 pts)
y_lr_predict = lr.predict(x_test)
# Evaluation
print("lr的均⽅误差为:", mean_squared_error(y_test, y_lr_predict))    # 8. MSE (4 pts)
print("lr的平均绝对误差为:", mean_absolute_error(y_test, y_lr_predict))  # 9. MAE (4 pts)
# Visualise the result
plt.plot(y_test, 'r', label='true_data')     # 10. red (2 pts)
plt.plot(y_lr_predict, 'b', label='predict')  # 11. blue (2 pts)
plt.legend()
plt.show()
支持向量机回归模型
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# load_boston was removed in scikit-learn 1.2; fall back to the raw file as
# suggested by scikit-learn's removal notice when it is unavailable.
try:
    from sklearn.datasets import load_boston
except ImportError:
    load_boston = None

if load_boston is not None:
    dataset = load_boston()
    print(dataset.DESCR)
    x = dataset.data    # 1. all feature variables (1 pt)
    y = dataset.target  # 2. target value: house price (1 pt)
else:
    import numpy as np
    import pandas as pd
    raw_df = pd.read_csv("http://lib.stat.cmu.edu/datasets/boston",
                         sep=r"\s+", skiprows=22, header=None)
    # Every record spans two physical lines in the raw file.
    x = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    y = raw_df.values[1::2, 2]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=33)

from sklearn.preprocessing import StandardScaler
ss_x = StandardScaler()  # 3. scaler for the features (1 pt)
ss_y = StandardScaler()  # 4. scaler for the target (1 pt)
x_train = ss_x.fit_transform(x_train)
x_test = ss_x.transform(x_test)
# 5-6. The scaler needs 2-D input, hence reshape(-1, 1) (1 pt each)
y_train = ss_y.fit_transform(y_train.reshape(-1, 1))
y_test = ss_y.transform(y_test.reshape(-1, 1))
print(x_train)

from sklearn.svm import SVR
# 7-9. SVR with a linear kernel: configure, train, predict (2 pts each).
#      ravel() passes y as the 1-D array that SVR expects.
linear_svr = SVR(kernel='linear')
linear_svr.fit(x_train, y_train.ravel())
linear_svr_y_predict = linear_svr.predict(x_test)
# 10-12. SVR with an RBF kernel: configure, train, predict (2 pts each)
rbf_svr = SVR(kernel='rbf')
rbf_svr.fit(x_train, y_train.ravel())
rbf_svr_y_predict = rbf_svr.predict(x_test)

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
# 13. Undo the scaling (1 pt each). Predictions are 1-D and inverse_transform
#     needs 2-D input, so reshape them to a column first.
ytre1 = ss_y.inverse_transform(y_test)
ytre2 = ss_y.inverse_transform(y_test)
ypre1 = ss_y.inverse_transform(linear_svr_y_predict.reshape(-1, 1))
ypre2 = ss_y.inverse_transform(rbf_svr_y_predict.reshape(-1, 1))
# 14. Errors between predictions and true values (2 pts each)
print('The mean squared error of linear SVR is', mean_squared_error(ytre1, ypre1))
print('The mean absoluate error of linear SVR is', mean_absolute_error(ss_y.inverse_transform(y_test), ypre1))
print('The mean squared error of RBF SVR is', mean_squared_error(ytre2, ypre2))
print('The mean absoluate error of RBF SVR is', mean_absolute_error(ytre2, ypre2))

# Visualise the linear-kernel predictions
plt.plot(ypre1, label='pre')
plt.plot(ytre1, label='true')
plt.legend()
plt.show()
# Visualise the RBF-kernel predictions
plt.plot(ypre2, label='pre')
plt.plot(ytre2, label='true')
plt.legend()
plt.show()
多层感知器
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPRegressor
import pandas as pd
import numpy as np

# 1-3. Read the file (1 pt each). Named data_path so the builtin dir() is
#      not shadowed.
data_path = './data/data2.xlsx'
data = pd.read_excel(data_path)
data.info()
y = data['溶解氧'].values
x = data.drop(['溶解氧'], axis=1).values
# 4. Split the data set (2 pts)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
# 5-8. Min-max scaling of features and target (1 pt each)
ss_x = MinMaxScaler()
ss_y = MinMaxScaler()
x_train = ss_x.fit_transform(x_train)
x_test = ss_x.transform(x_test)
y_train = ss_y.fit_transform(y_train.reshape(-1, 1))
y_test = ss_y.transform(y_test.reshape(-1, 1))
# 9. Initialise the multi-layer perceptron (5 pts)
clf = MLPRegressor(max_iter=500)
# 10. Train (3 pts); ravel() passes y as the 1-D array the regressor expects
clf.fit(x_train, y_train.ravel())
# 11. Predict (1 pt) and reshape to a column for inverse_transform (2 pts)
predict = clf.predict(x_test)
predict = np.array(predict).reshape(len(predict), 1)
# 12. Errors in the original units (1 pt each, 2 pts for the MAE)
ytre = ss_y.inverse_transform(y_test)
ypre = ss_y.inverse_transform(predict)
print('The mean squared error of linear MLP is', mean_squared_error(ytre, ypre))
print('The mean absolute error of linear MLP is', mean_absolute_error(ss_y.inverse_transform(y_test), ypre))
# 13. Plot the curves (1 pt each)
plt.plot(ypre, label='pre')
plt.plot(ytre, label='true')
plt.legend()
plt.show()
Class Three
卷积神经网络模型
# Small convolutional network for binary cat/dog classification, trained from
# an augmented directory iterator.
import numpy as np
from PIL import Image
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential, Model
from keras.layers import Dropout, Flatten, Dense
from keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.optimizers import RMSprop, Adam, SGD
from keras.callbacks import ModelCheckpoint
from keras import applications
from keras import optimizers

train_data_dir = './data/3.1/train'  # 1. training data directory (1 pt)
ClassNames = ['cat', 'dog']          # 2. label names; optional list of sub-folders (1 pt)
batchsize = 4                        # 3. batch size (1 pt)

generator = ImageDataGenerator(
    rotation_range=10,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.1,
    # 4. Every pixel value is multiplied by this factor, which maps
    # 0..255 pixel values into 0..1 (1 pt).
    rescale=1.0/255,
)

# NOTE(review): shuffle=False feeds the images in directory order; confirm
# that unshuffled training batches are intended.
train = generator.flow_from_directory(
    train_data_dir,
    target_size=(224, 224),  # 5. resize every image (1 pt)
    classes=ClassNames,
    shuffle=False,
    class_mode='binary',     # 6. label mode; 'categorical' is the alternative (1 pt)
    batch_size=batchsize,
)

# 7. Model: three conv + max-pool stages followed by a dense classifier
# (1 pt per layer).
model = Sequential()
model.add(Conv2D(4, (5, 5), activation='relu', padding='same', input_shape=(224, 224, 3)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(8, (3, 3), padding='same', activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(16, (3, 3), padding='same', activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
# Flatten turns the multi-dimensional feature maps into one vector, the
# usual transition from convolutional to fully connected layers.
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

rmsprop = RMSprop(learning_rate=0.0001)  # 8. learning rate (2 pts)
# 9. Loss function and optimizer (2 pts).
model.compile(loss='binary_crossentropy', optimizer=rmsprop, metrics=['accuracy'])

# 10. Callback that keeps only the weights with the best training accuracy,
# saved as weight.h (2 pts).
best_model = ModelCheckpoint(filepath='weight.h', monitor='accuracy', verbose=1,
                             save_best_only=True)
model.summary()

# Model.fit accepts generators directly; fit_generator is deprecated.
model.fit(
    train,
    steps_per_epoch=400//batchsize,
    epochs=15,  # 11. number of epochs (2 pts)
    callbacks=[best_model],
)
VGG模型
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential, Model
from keras.layers import Dropout, Flatten, Dense
from tensorflow.keras.optimizers import RMSprop, Adam, SGD, Adadelta
from keras.callbacks import ModelCheckpoint
from keras.applications.vgg16 import VGG16, preprocess_input

# Input pipeline for the crab/shrimp transfer-learning exercise.
# 1. training data directory, 2. label names, 3. batch size (1 pt each).
train_data_dir = './data/3.2/train'
ClassNames = ['crab', 'shrimp']
batchsize = 8

# Light augmentation; `rescale` multiplies each pixel value by 1/255.
generator = ImageDataGenerator(
    rescale=1.0/255,  # 4. map pixel values into 0..1 (1 pt)
    rotation_range=10,
    zoom_range=0.1,
    width_shift_range=0.1,
    height_shift_range=0.1,
)

# Directory iterator producing batches of resized images with binary labels.
train = generator.flow_from_directory(
    train_data_dir,
    classes=ClassNames,
    class_mode='binary',     # 6. label mode; 'categorical' is the alternative (1 pt)
    target_size=(224, 224),  # 5. resize every image (1 pt)
    batch_size=batchsize,
    shuffle=False,
)
# 7. Model construction: a VGG16 base with ImageNet weights, without its top
# classifier layers, for 224x224 RGB input (1 pt).
vgg16 = VGG16(
    input_shape=(224, 224, 3),
    include_top=False,
    weights='imagenet',
)

# Mark every layer of the pretrained base as non-trainable (1 pt).
for layer in vgg16.layers:
    layer.trainable = False
import tensorflow

# Classifier head on top of the VGG16 output: flatten, two identical
# Dense(256, relu) + Dropout(0.5) stages, then one sigmoid unit
# (1 pt per graded layer).
last = vgg16.output
x = Flatten()(last)
for _ in range(2):
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.5)(x)
x = Dense(1, activation='sigmoid')(x)

# Full model from the VGG16 input to the sigmoid output (1 pt).
model = Model(inputs=vgg16.input, outputs=x)
# 8. Callback that keeps only the model with the best training accuracy (2 pts).
best_model = ModelCheckpoint(filepath='weight.h', monitor='accuracy', verbose=1,
                             save_best_only=True)

# 9. Adadelta optimizer with its default settings (2 pts).
adadelta = Adadelta()

# 10. Loss function and optimizer (2 pts).
model.compile(loss='binary_crossentropy', optimizer=adadelta, metrics=['accuracy'])
model.summary()

# Model.fit accepts generators directly; fit_generator is deprecated.
model.fit(
    train,
    steps_per_epoch=600//batchsize,
    epochs=2,  # 11. number of epochs (2 pts)
    callbacks=[best_model],
)
LSTM模型
import math

import keras as K
import numpy as np
from numpy import concatenate
from pandas import DataFrame
from pandas import concat
from pandas import read_csv
import matplotlib.pyplot as pyplot
import sklearn.metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM

# 1. Load the CSV and print its column summary (2 pts).
dataset = read_csv('./data/3.3/data.csv')
dataset.info()

# 2. Work on a float32 copy of the raw values (2 pts).
values = dataset.values.astype('float32')

# 3-4. Min-max scale every column into [0, 1]; (0, 1) is also the scaler's
# default range (2 pts each).
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(values)

# Frame as supervised learning: rows are cut, in order, at 80% and 90% of the
# data into a training part, a test part and a final check part.
value2 = scaled
n_train_hours = int(len(dataset) * 0.8)
k = int(len(dataset) * 0.9)
train, test, cheak = np.split(value2, [n_train_hours, k])
# Split each partition into inputs (all columns but the last) and the output
# (the last column).
train_X = train[:, :-1]  # 5. training inputs/outputs (2 pts)
train_y = train[:, -1]
test_X = test[:, :-1]    # 6. test inputs/outputs (2 pts)
test_y = test[:, -1]
cheak_X = cheak[:, :-1]
cheak_y = cheak[:, -1]

# Make the inputs 3-D by inserting a length-1 middle axis:
# (rows, columns) -> (rows, 1, columns).
train_X = np.expand_dims(train_X, axis=1)
test_X = np.expand_dims(test_X, axis=1)
cheak_X = np.expand_dims(cheak_X, axis=1)
# 7. Network: one LSTM layer with 50 units feeding a single-unit Dense
# output, trained with MAE loss and the Adam optimizer (1 pt per line).
model = Sequential()
model.add(LSTM(50, input_shape=(train_X.shape[1], train_X.shape[2])))
model.add(Dense(1))
model.compile(loss='mae', optimizer='adam')

# Train the network; shuffle=False keeps the rows in their original order.
history = model.fit(
    train_X,
    train_y,
    epochs=10,      # 8. maximum number of epochs (1 pt)
    batch_size=50,  # 9. batch size (1 pt)
    validation_data=(test_X, test_y),
    verbose=2,
    shuffle=False,
)

# 10. Plot the training-loss curve (1 pt). The legend label previously read
# 'loass'.
pyplot.plot(history.history['loss'], label='loss')
pyplot.legend()
pyplot.show()
# 11. Run the trained model on the check inputs (2 pts).
ycheak = model.predict(cheak_X)

# Drop the middle axis again so the inputs are 2-D:
# (rows, 1, columns) -> (rows, columns).
cheak_X = np.squeeze(cheak_X, axis=1)
# Append the predictions as the last column next to the check inputs, so the
# scaler can invert the whole row, then keep only that last column. Taking
# index -1 instead of a hard-coded 6 works for any number of input columns.
inv_cheak = concatenate((cheak_X, ycheak), axis=1)
inv_cheak = scaler.inverse_transform(inv_cheak)
inv_cheak = inv_cheak[:, -1]

# Turn the true outputs into a column so they can be concatenated the same way.
cheak_y = cheak_y.reshape((len(cheak_y), 1))
# Append the true outputs as the last column next to the check inputs, invert
# the scaling, and keep only that last column. Taking index -1 instead of a
# hard-coded 6 works for any number of input columns.
inv_c = concatenate((cheak_X, cheak_y), axis=1)
inv_c = scaler.inverse_transform(inv_c)
inv_c = inv_c[:, -1]

pyplot.plot(inv_cheak)  # 12. predicted curve (2 pts)
pyplot.plot(inv_c)      # 13. actual curve (2 pts)
pyplot.show()