diff --git a/doc/software-compatibility/dist/oepkgs.png b/doc/software-compatibility/dist/oepkgs.png
new file mode 100644
index 0000000000000000000000000000000000000000..e1e48ed6fb1643a004b47f2d28ca68dfe1723852
Binary files /dev/null and b/doc/software-compatibility/dist/oepkgs.png differ
diff --git "a/doc/software-compatibility/rpm\346\236\204\345\273\272\344\273\245\345\217\212\345\273\272\344\273\223\346\265\201\347\250\213.md" "b/doc/software-compatibility/rpm\346\236\204\345\273\272\344\273\245\345\217\212\345\273\272\344\273\223\346\265\201\347\250\213.md"
index d111c6fb6827296ee284d5142da8116e98631dea..d72fd2d18e37ff1ed1d841247ed07b3ce55dbe7d 100644
--- "a/doc/software-compatibility/rpm\346\236\204\345\273\272\344\273\245\345\217\212\345\273\272\344\273\223\346\265\201\347\250\213.md"
+++ "b/doc/software-compatibility/rpm\346\236\204\345\273\272\344\273\245\345\217\212\345\273\272\344\273\223\346\265\201\347\250\213.md"
@@ -6,9 +6,9 @@
 3. 仓库的webhook将自动触发构建任务
 
 #### 一、基于PR,创建仓库
-在[oepkgs-management](https://gitee.com/oepkgs/oepkgs-management)仓库提PR(如何提PR,详见文档最后的[QA](##QA)),填写两个配置文件,PR合入之后,创仓机器人ci-rebot会在[src-oepkgs](https://gitee.com/src-oepkgs)下面自动创建仓库。
+在[oepkgs-management](https://gitee.com/oepkgs/oepkgs-management)仓库提PR(如何提PR,详见文档最后的[QA](#QA)),填写两个配置文件,PR合入之后,创仓机器人ci-robot会在[src-oepkgs](https://gitee.com/src-oepkgs)下面自动创建仓库。
 
-oepkgs-management仓库中的两个配置文件(以qemu为例):
+oepkgs-management仓库中的两个配置文件(以nginx为例):
 ```
 # 在oepkgs-management仓库sig目录下面创建虚拟化领域的sig组
 # 创建oepkgs-management/sig/virtual/sig-info.yaml文件
@@ -29,19 +29,19 @@ maintainers:
 # 该sig组管理的仓库
 repositories:
 - repo:
-  - src-oepkgs/qemu
+  - src-oepkgs/nginx
 ```
 
 ```
 # 在oepkgs-management/sig/virtual下面创建src-oepkgs/仓库名称首字母/仓库名称.yaml
 # ci-rebot将依据这个文件进行自动建仓
-oepkgs-management/sig/virtual/src-oepkgs/q/qemu.yaml:
+oepkgs-management/sig/virtual/src-oepkgs/n/nginx.yaml:
 
 # 仓库名称
-name: qemu
-description: "QEMU is a generic and open source processor emulator which achieves a good emulation speed by using dynamic translation"
+name: nginx
+description: "Nginx is a high performance HTTP and reverse proxy web server"
 
 # 仓库地址
-upstream: https://gitee.com/src-oepkgs/qemu
+upstream: https://gitee.com/src-oepkgs/nginx
 
 # 仓库分支
 branches:
 - name: master
@@ -55,9 +55,9 @@ branches:
 type: public
 ```
 
 #### 二、补充源码文件
-完成步骤一之后,5分钟内会生成https://gitee.com/src-oepkgs/qemu仓库,通过PR往这个仓库中补充源码文件:
+完成步骤一之后,5分钟内会生成 https://gitee.com/src-oepkgs/nginx 仓库,通过PR往这个仓库中补充源码文件:
 
-分别是可用于支撑生成rpm包的qemu.spec文件、软件包源码包qemu-2.12.0.tar.bz2,详见:https://gitee.com/src-oepkgs/qemu
+分别是可用于支撑生成rpm包的nginx.spec文件、软件包源码包nginx-1.20.1.tar.gz,详见:https://gitee.com/src-oepkgs/nginx
 
 提了PR之后,在5~30分钟时间内,会进行PR门禁构建测试,PR会评论出PR构建结果,建议在**Build_Result**显示为**SUCCESS**之后合入PR
@@ -92,7 +92,7 @@ suite: rpmbuild
 # 测试用例 ~/lkp-test/tests/rpmbuild
 rpmbuild:
 
 # 远程仓库源码包地址
-upstream_repo: https://gitee.com/src-oepkgs/qemu
+upstream_repo: https://gitee.com/src-oepkgs/nginx
 
 # 构建后的包仓库位置,都将放置在contrib/$sig仓库中
 custom_repo_name: contrib/virtual
@@ -113,9 +113,7 @@ os_version: $upstream_branch
 # docker_image: $upstream_branch
 ```
 
 #### 2. 查看日志判断是否构建成功
-###### 2.1 可通过job_id来查看日志(该job_id之后将由门禁系统,以评论形式评论至仓库PR中,目前暂无)
- https://compass-ci.openeuler.org/jobs
-
+###### 2.1 构建日志将由构建工程系统以评论形式评论至仓库PR中
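+
+补充:rpmbuild.yaml填写完成后,可按如下方式提交(示意:假设已按compass-ci文档部署好lkp-tests客户端环境,submit命令可用):
+```
+# 提交上文填写好的任务文件,触发一次rpmbuild构建
+submit rpmbuild.yaml
+```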
 #### 3. rpmbuild脚本
 在submit rpmbuild.yaml 时,测试用例**rpmbuild**会去引用脚本
@@ -125,8 +123,7 @@ os_version: $upstream_branch
 如果构建成功,则通过upload_rpm_pkg函数先将测试机上打好的软件包放入```/srv/rpm/upload```,再通过update_repo_mq处理上传的软件包。处理完的包会先放入/srv/rpm/testing中,每天零点定时更新到/srv/rpm/pub中,也就是https://repo.oepkgs.net/openEuler/rpm/仓库中
 
 #### 4. 测试构建的包能否正常安装
-###### 4.1 可以查看job_id(自动构建任务,无需提交,可通过job_id来查看日志,该job_id之后将由门禁系统,以评论形式评论至仓库PR中,目前暂无)
-https://compass-ci.openeuler.org/jobs
+###### 4.1 安装测试为自动触发的构建任务,无需手动提交,测试结果将以评论形式评论至仓库PR中
 
 ###### 4.2 手动提交install.yaml 需要加入以下参数
@@ -174,9 +171,9 @@ mount_repo_name: compatible/c7
 
 ### 如何查询软件包位置?
-[https://compass-ci.openeuler.org/oepkgs](https://compass-ci.openeuler.org/oepkgs)
+[https://search.oepkgs.net/](https://search.oepkgs.net/)
 可在此查询引入到软件所的软件包
 
 ### 如何下载使用仓库中的软件包?
-在[https://compass-ci.openeuler.org/oepkgs](https://compass-ci.openeuler.org/oepkgs)
-查询软件包在软件所中的仓库存放位置之后,详见[openEuler社区开源软件适配流程.md](https://gitee.com/openeuler/oec-application/blob/master/doc/openEuler%E7%A4%BE%E5%8C%BA%E5%BC%80%E6%BA%90%E8%BD%AF%E4%BB%B6%E9%80%82%E9%85%8D%E6%B5%81%E7%A8%8B.md)的最后一节:**下载使用软件**,修改这一节中的示例中的**baseurl**即可。
+在[https://search.oepkgs.net/](https://search.oepkgs.net/)
+查询软件包在软件所中的仓库存放位置之后,点开软件包的详情页,按照安装指引便可下载使用软件包。
diff --git "a/doc/\345\214\227\345\220\221\345\274\200\346\272\220\350\275\257\344\273\266\345\214\205\351\200\202\351\205\215\350\277\201\347\247\273\350\257\246\347\273\206\346\214\207\345\257\274.md" "b/doc/\345\214\227\345\220\221\345\274\200\346\272\220\350\275\257\344\273\266\345\214\205\351\200\202\351\205\215\350\277\201\347\247\273\350\257\246\347\273\206\346\214\207\345\257\274.md"
index 1e3e20e96412a8219ccc11948537c484c4f73ca8..799f21df7cd0ff61b3fc0670dd45d723ceb3c0b4 100644
--- "a/doc/\345\214\227\345\220\221\345\274\200\346\272\220\350\275\257\344\273\266\345\214\205\351\200\202\351\205\215\350\277\201\347\247\273\350\257\246\347\273\206\346\214\207\345\257\274.md"
+++ "b/doc/\345\214\227\345\220\221\345\274\200\346\272\220\350\275\257\344\273\266\345\214\205\351\200\202\351\205\215\350\277\201\347\247\273\350\257\246\347\273\206\346\214\207\345\257\274.md"
@@ -1,58 +1,98 @@
-## 背景介绍
+[TOC]
+
+### 背景介绍
 [oepkgs](https://oepkgs.net/zh/) 全称开放软件包服务(Open External Packages Service),是一个为 openEuler 以及其他 Linux 发行版提供软件包服务和容器镜像服务的第三方社区。
 
 oepkgs 社区提供两种开源软件包适配方式:第一种方式是将开源软件包的源码合入 [src-oepkgs](https://gitee.com/src-oepkgs) 组织仓下面,由 src-oepkgs 的构建服务对软件包进行构建测试、兼容性测试,并进入 oepkgs 的[主体仓库](https://repo.oepkgs.net/openEuler/rpm/)中。另一种方式是用户通过网页快速构建软件包,软件包进入个人账户下面的某个仓库中。
+![输入图片说明](./software-compatibility/dist/oepkgs.png)
 
 ### 开源软件引入oepkgs主仓总体流程
->**1. 获取到spec文件以及源码文件**
+>**1. 初始化RPM编译环境**
 
 >**2. 在openEuler上进行编译构建**
 
 >**3. 在openEuler上进行兼容性测试**
 
->**4. 将已经适配好的spec文件以及源码文件存放在src-oepkgs仓库中(建仓流程详见[rpm包构建及建仓流程](https://gitee.com/openeuler/oec-application/blob/master/doc/software-compatibility/rpm%E6%9E%84%E5%BB%BA%E4%BB%A5%E5%8F%8A%E5%BB%BA%E4%BB%93%E6%B5%81%E7%A8%8B.md))**
-### 1. 软件包spec及源码文件获取
+>**4. 使用src-oepkgs社区构建工程**
 
-**1.1 在一些网站上找到软件包的src.rpm包,解压获取spec文件以及软件包的源码文件:**
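+
+下文各步骤均以nginx为例展开,先给出一个命令速览(示意,完整命令与参数见各小节):
+```
+yum install -y dnf-plugins-core rpm-build   # 步骤1:安装构建工具,初始化编译环境
+rpmbuild -ba ~/rpmbuild/SPECS/nginx.spec    # 步骤2:编译构建
+yum localinstall ~/rpmbuild/RPMS/x86_64/*   # 步骤3:安装测试
+```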
+#### 1. 初始化RPM编译环境
+执行命令,安装构建工具:
 ```
-https://pkgs.org/
-https://src.fedoraproject.org/projects/rpms/*
-https://koji.fedoraproject.org/koji/packages
-www.google.com
-www.baidu.com
-www.bing.com
+yum install -y dnf-plugins-core rpm-build
 ```
-以libvirt 4.5.0版本引入为例,在网上寻找src.rpm包的流程如下图所示:
-![输入图片说明](./software-compatibility/dist/image.png)
-![输入图片说明](./software-compatibility/dist/image2image.png)
+生成目录结构:
 ```
-rpm -i http://vault.centos.org/7.9.2009/os/Source/SPackages/libvirt-4.5.0-36.el7.src.rpm
+# 输入任意 *.spec 文件名,这一步会因找不到文件而报错,但会自动生成rpmbuild目录结构
+rpmbuild -ba nginx.spec
+error: failed to stat /root/nginx.spec: No such file or directory
+# 查看自动生成的目录结构
+ls ~/rpmbuild/
+BUILD BUILDROOT RPMS SOURCES SPECS SRPMS
 ```
-![输入图片说明](./software-compatibility/dist/image3image.png)
-![输入图片说明](./software-compatibility/dist/image4image.png)
-如上图所示的```~/rpmbuild/SPECS/``` 和 ```~/rpmbuild/SOURCES/```目录下面分别存放了软件包的spec文件以及软件包的源码文件
-
-### 2. 在openEuler上进行编译构建:
-执行命令,安装构建工具:
+准备软件源码,放入SOURCES目录:
 ```
-yum install -y dnf-plugins-core rpm-build
+wget http://nginx.org/download/nginx-1.20.1.tar.gz
+cp nginx-1.20.1.tar.gz ~/rpmbuild/SOURCES/
+```
+创建并修改SPEC配置文件:
+```
+# 编写后缀为.spec的文件
+vim ~/rpmbuild/SPECS/nginx.spec
+Name: nginx
+Version: 1.20.1
+Release: 10
+Summary: Nginx is a web server.
+License: BSD
+Group: Productivity/Networking/Web/Proxy
+URL: test.rpm.com
+Source0: nginx-1.20.1.tar.gz
+BuildRequires: gcc
+BuildRequires: pcre2-devel
+BuildRequires: pcre-devel
+BuildRequires: openssl-devel
+BuildRequires: gdb-headless
+
+%description
+Building a nginx-1.20.1.rpm from nginx-1.20.1.tar.gz
+
+%post
+useradd nginx
+
+%prep
+%setup -q
+
+%build
+./configure
+make %{?_smp_mflags}
+
+%install
+make install DESTDIR=%{buildroot}
+
+%files
+%doc
+/usr/local/nginx/*
+
+%changelog
+* Tue Dec 06 2022 liping - 1.20.1-10
+- Release Nginx 1.20.1
 ```
+#### 2. 在openEuler上进行编译构建
 执行命令,安装软件包的依赖包
 ```
 # yum-builddep -y ~/rpmbuild/SPECS/*.spec
-yum-builddep -y ~/rpmbuild/SPECS/libvirt.spec
+yum-builddep -y ~/rpmbuild/SPECS/nginx.spec
 ```
 执行命令,对软件包进行编译构建
 ```
 # rpmbuild -ba ~/rpmbuild/SPECS/*.spec
-rpmbuild -ba ~/rpmbuild/SPECS/libvirt.spec
+rpmbuild -ba ~/rpmbuild/SPECS/nginx.spec
 ```
 编译构建通过就会在 ~/rpmbuild/RPMS/ 目录下面生成 rpm包
 ```
 ls ~/rpmbuild/RPMS/*
 ```
-### 3. 在openEuler上进行兼容性测试
+#### 3. 在openEuler上进行兼容性测试
 执行命令,测试软件包的安装、卸载
 ```
 yum localinstall ~/rpmbuild/RPMS/x86_64/*
@@ -63,9 +103,17 @@
 yum remove *
 systemctl start *
 systemctl stop *
 ```
-### 4. 将已经适配好的软件包的spec文件以及~/rpmbuild/SOURCE目录下面的源码文件存放在src-oepkgs仓库中(建仓流程详见[rpm包构建及建仓流程](https://gitee.com/openeuler/oec-application/blob/master/doc/software-compatibility/rpm%E6%9E%84%E5%BB%BA%E4%BB%A5%E5%8F%8A%E5%BB%BA%E4%BB%93%E6%B5%81%E7%A8%8B.md))
+#### 4. 使用src-oepkgs构建工程完成软件包上传
+
+将软件包的spec文件以及~/rpmbuild/SOURCES目录下面的源码文件存放在src-oepkgs仓库中
+
+建仓流程详见[rpm包构建及建仓流程](https://gitee.com/openeuler/oec-application/blob/master/doc/software-compatibility/rpm%E6%9E%84%E5%BB%BA%E4%BB%A5%E5%8F%8A%E5%BB%BA%E4%BB%93%E6%B5%81%E7%A8%8B.md)
+
+
+### 开源软件引入oepkgs个人仓总体流程
+
+通过 build.dev.oepkgs.net 构建的总体流程如下:
-## build.oepkgs.net 构建总体流程
 
 >**1. 创建个人软件包仓库**
 
 >**2. 创建并提交构建任务**
@@ -73,14 +121,14 @@
 
 >**3. 查看并分析构建日志**
 
 >**4. 在个人仓库中下载使用软件包**
 
-### 1. 创建个人软件包仓库
+#### 1. 创建个人软件包仓库
 在rpm包构建之前,我们可以先选择一个已有的软件包仓库地址或新增一个软件包仓库地址去存放我们待构建的软件包。
 
 切换到构建页面,选择 RPM构建 ---> 仓库管理 ---> 新增仓库
 
 ![输入图片说明](./software-compatibility/dist/storageimage.png)
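+
+仓库创建完成后,后续构建出的rpm包会发布到该仓库。可以用类似如下的repo配置在openEuler上使用这个仓库(示意:baseurl为占位符,应以仓库管理页面实际显示的仓库地址为准):
+```
+cat > /etc/yum.repos.d/oepkgs-personal.repo << EOF
+[oepkgs-personal]
+name=oepkgs personal repo
+baseurl=<仓库管理页面显示的仓库地址>
+enabled=1
+gpgcheck=0
+EOF
+yum makecache
+```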
-### 2. 新建一个构建任务
+#### 2. 新建一个构建任务
 
 通过提交构建任务,编译构建出软件包,并发布到上一步骤创建的仓库中。
 
 ![输入图片说明](./software-compatibility/dist/buildtask.png)
 
 ![输入图片说明](./software-compatibility/dist/8d91fc9db3681d3b367febf82cb83ee.png)
 
@@ -95,7 +143,7 @@
 
-### 3. 查看构建日志
+#### 3. 查看构建日志
 
 ![输入图片说明](./software-compatibility/dist/0474384023b26c8481351fc61236064.png)
 ![输入图片说明](./software-compatibility/dist/76ddf9290d4b2a2699f9e924d91e735.png)
diff --git a/tools/data_collection/README.md b/tools/data_collection/README.md
index a95084ce4f1046b8cb5620f50657bfd544546081..71bfa4258405a42983cecb84192d0e10387c01f6 100644
--- a/tools/data_collection/README.md
+++ b/tools/data_collection/README.md
@@ -83,3 +83,21 @@ dnf install
 ...
 }
 ```
+
+## get_ecology_compatiable_list.py
+
+### 用法
+
+```
+python3 get_ecology_compatiable_list.py
+    将https://ecology.chinauos.com上的兼容性清单导出到excel表格:ecology_uos_compatiable_list.xls
+```
+
+## get_kylinos_compatiable_list.py
+
+### 用法
+
+```
+python3 get_kylinos_compatiable_list.py
+    将https://eco.kylinos.cn上的兼容性清单导出到excel表格:kylinos_compatiable_list.xlsx
+```
diff --git a/tools/data_collection/get_docker_info.py b/tools/data_collection/get_docker_info.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8886a70967186c30455ab75401b4ebf39458dd4
--- /dev/null
+++ b/tools/data_collection/get_docker_info.py
@@ -0,0 +1,88 @@
+# coding=utf-8
+import json
+import openpyxl
+import requests
+import re
+
+# 分别匹配markdown中的行内链接、脚注链接文本、脚注链接地址
+INLINE_LINK_RE = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
+FOOTNOTE_LINK_TEXT_RE = re.compile(r'\[([^\]]+)\]\[(\d+)\]')
+FOOTNOTE_LINK_URL_RE = re.compile(r'\[(\d+)\]:\s+(\S+)')
+
+class Spider:
+    def __init__(self):
+        self.url = "https://hub.docker.com/api/content/v1/products/search?operating_system=linux&page_size=25&q="
+        self.response = []
+
+    def pull_all_data(self):
+        payload = {}
+        headers = {
+            'Accept': 'application/json',
+            'Search-Version': 'v3'
+        }
+        response = json.loads(requests.request("GET", self.url, headers=headers, data=payload).text)
+        page_size = response["page_size"]
+        numFound = 100  # 只采集搜索结果中的前100个镜像
+        page_num = numFound // page_size
+        for i in range(1, page_num+1):
+            url = "https://hub.docker.com/api/content/v1/products/search?operating_system=linux&page_size=25&page=" + str(i)
+            payload = {}
+            headers = {
+                'Accept': 'application/json',
+                'Search-Version': 'v3'
+            }
+            response = json.loads(requests.request("GET", url, headers=headers, data=payload).text)
+            next_url = ""
+            for item in response['summaries']:
+                name = item['name']
+                # 官方镜像的名称不带命名空间,其详情接口走library/路径
+                if name.split("/")[0] == item['name']:
+                    name = "-".join([i.lower() for i in name.split(" ")])
+                    next_url = "https://hub.docker.com/v2/repositories/library/" + name
+                else:
+                    next_url = "https://hub.docker.com/v2/repositories/" + name.split('/')[0] + "/" + name.split('/')[1] + "/"
+                item["dockerhub_url"] = next_url
+                info = self.get_more_information(next_url)
+                dictMerged2 = dict(item, **info)
+                self.response.append(dictMerged2)
+
+    def get_more_information(self, url):
+        response = json.loads(requests.request("GET", url).text)
+        links = {}
+        if response.get("full_description"):
+            links = self.find_md_links(response['full_description'])
+        return links
+
+    def find_md_links(self, md):
+        """ Return dict of links in markdown """
+
+        links = dict(INLINE_LINK_RE.findall(md))
+        footnote_links = dict(FOOTNOTE_LINK_TEXT_RE.findall(md))
+        footnote_urls = dict(FOOTNOTE_LINK_URL_RE.findall(md))
+
+        for key, value in footnote_links.items():
+            footnote_links[key] = footnote_urls[value]
+        links.update(footnote_links)
+        urls = {}
+        url_list = []
+        for i in links.values():
+            if "Dockerfile" in i:
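+                # 只保留README中指向Dockerfile的链接,作为镜像的源码线索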
+                url_list.append(i)
+        urls["urls"] = "\n".join(url_list)
+
+        return urls
+
+    def write_to_xlsx(self):
+        wb = openpyxl.Workbook()
+        sheet = wb.active
+
+        for col, head in enumerate(self.response[0].keys()):
+            sheet.cell(row=1, column=col + 1, value=head)
+
+        for row, sheet_row in enumerate(self.response):
+            for col, row_key in enumerate(sheet_row.keys()):
+                sheet.cell(row=row + 2, column=col + 1, value=str(sheet_row[row_key]))
+        wb.save(filename="docker_top_100.xlsx")
+
+if __name__ == '__main__':
+    sp = Spider()
+    sp.pull_all_data()
+    sp.write_to_xlsx()
diff --git a/tools/data_collection/get_ecology_compatiable_list.py b/tools/data_collection/get_ecology_compatiable_list.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5dc1086f316913722c0dfa8dee2631a70a38773
--- /dev/null
+++ b/tools/data_collection/get_ecology_compatiable_list.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+# coding=utf-8
+import urllib3
+import json
+import xlwt
+
+class EcologyUos:
+    def __init__(self):
+        self.all_data = {}
+
+    def pull_all_data(self):
+        # 先请求一次拿到清单总数count,再以count为limit一次性拉取全部数据
+        http = urllib3.PoolManager(10)
+        req = http.request('GET', 'https://ecology.chinauos.com/analysis/jingpin/search?page=1&limit=10&status=done&framework=all&url=&query=')
+
+        encoded_data = json.loads(req.data.decode('utf-8'))
+        count = encoded_data["count"]
+        url = "https://ecology.chinauos.com/analysis/jingpin/search?page=1&limit=" + str(count) + "&status=done&framework=all&url=&query="
+        req_all_data_hash = json.loads(http.request('GET', url).data.decode('utf-8'))
+        self.all_data = req_all_data_hash
+
+    def write_to_xlsx(self):
+        sheet_data = self.all_data["data"]
+        workbook = xlwt.Workbook(encoding='utf-8')
+        sheet1 = workbook.add_sheet("统信兼容性清单")
+        for col, head in enumerate(sheet_data[0].keys()):
+            sheet1.write(0, col, head)
+
+        for row, sheet_row in enumerate(sheet_data):
+            for col, row_key in enumerate(sheet_row.keys()):
+                sheet1.write(row+1, col, sheet_row[row_key])
+
+        workbook.save('./ecology_uos_compatiable_list.xls')
+
+if __name__ == '__main__':
+    eu = EcologyUos()
+    eu.pull_all_data()
+    eu.write_to_xlsx()
diff --git a/tools/data_collection/get_kylinos_compatiable_list.py b/tools/data_collection/get_kylinos_compatiable_list.py
new file mode 100644
index 0000000000000000000000000000000000000000..94adacc219856c2a84b41dfef3bed723d676a16e
--- /dev/null
+++ b/tools/data_collection/get_kylinos_compatiable_list.py
@@ -0,0 +1,93 @@
+#!/usr/bin/env python3
+# coding=utf-8
+import urllib3
+import json
+import openpyxl
+from queue import Queue
+import threading
+
+'''
+整体的思路:
+1、构造任务队列pageQueue,存放所有要爬取的页面url
+2、用多线程爬虫,将抓取的页面内容存放到all_data中
+3、再将all_data写入xlsx文件
+'''
+
+class Crawl_thread(threading.Thread):
+    '''
+    抓取线程类,注意需要继承线程类Thread
+    '''
+    def __init__(self, thread_id, queue):
+        threading.Thread.__init__(self)  # 需要对父类的构造函数进行初始化
+        self.thread_id = thread_id
+        self.queue = queue  # 任务队列
+
+    def run(self):
+        '''
+        线程启动后会执行对应的run方法
+        :return:
+        '''
+        print('启动线程:', self.thread_id)
+        self.crawl_spider()
+        print('退出了该线程:', self.thread_id)
+
+    def crawl_spider(self):
+        while True:
+            if self.queue.empty():  # 如果队列为空,则跳出
+                break
+            else:
+                page = self.queue.get()
+                print("第 %d 页" % (page))
+                print('当前工作的线程为:', self.thread_id, " 正在采集:", page)
+                try:
+                    http = urllib3.PoolManager(10)
+                    url = "https://eco.kylinos.cn/home/compatible/index.html?system_class=1" \
+                          "&system_id=&small_version_id=&is_plan=0&page=" + str(page) + "&limit=20"
+                    req_data = json.loads(http.request('GET', url).data.decode('utf-8'))["data"]
+                    all_data.extend(req_data)
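+                    # 注:多个爬虫线程并发向全局列表all_data追加数据,依赖CPython的GIL保证追加安全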
print("数据队列长度:",len(all_data),end="\n") + except Exception as e: + print('采集线程错误',e) + +all_data = [] +def main(): + http = urllib3.PoolManager(10) + req = http.request('GET', 'https://eco.kylinos.cn/home/compatible/index.html?' + 'system_class=1&system_id=&small_version_id=&is_plan=0&page=1&limit=20') + + encoded_data = json.loads(req.data.decode('utf-8')) + count = encoded_data["count"] + page_num = (count // 20) + 1 + pageQueue = Queue(count) # 任务队列,存放网页的队列 + for page in range(1, page_num+1): + pageQueue.put(page) # 构造任务队列 + # 初始化采集线程 + crawl_threads = [] + crawl_name_list = ['crawl_' + str(i) for i in range(1, 6)] # 总共构造5个爬虫线程 + for thread_id in crawl_name_list: + thread = Crawl_thread(thread_id, pageQueue) # 启动爬虫线程 + thread.start() # 启动线程 + crawl_threads.append(thread) + + # 等待队列情况,先进行网页的抓取 + while not pageQueue.empty(): + # 不为空,则继续阻塞 + pass + + # 等待所有线程结束 + for t in crawl_threads: + t.join() + + wb = openpyxl.Workbook() + sheet = wb.active + + for col, head in enumerate(all_data[0].keys()): + sheet.cell(row=1, column=col+1, value=head) + + for row, sheet_row in enumerate(all_data): + for col, row_key in enumerate(sheet_row.keys()): + sheet.cell(row=row+2, column=col+1, value=sheet_row[row_key]) + wb.save(filename="kylinos_compatiable_list.xlsx") + +if __name__ == '__main__': + main() diff --git a/tools/data_collection/get_suse_hardware_compatiable_list.py b/tools/data_collection/get_suse_hardware_compatiable_list.py new file mode 100644 index 0000000000000000000000000000000000000000..119be2115ff5fbe26f45c46c066e86cc6f6c8496 --- /dev/null +++ b/tools/data_collection/get_suse_hardware_compatiable_list.py @@ -0,0 +1,173 @@ +#!/usr/bin/env python3 +# coding=utf-8 +import urllib3 +from queue import Queue +import threading +import re +import html +import openpyxl + +''' +整体的思路: +1、构造任务队列pageQueue ,存放所有要爬取的页面url +2、用多线程爬虫将抓取的页面内容存放到data_queue中 +3、用多线程程序对data_queue中的页面内容进行解析,分别提取id, product, os, company, category, +''' + +class Crawl_thread(threading.Thread): + ''' + 抓取线程类,注意需要继承线程类Thread + ''' + def __init__(self,thread_id,queue): + threading.Thread.__init__(self) + self.thread_id = thread_id + self.queue = queue # 任务队列 + + def run(self): + ''' + 线程在调用过程中就会调用对应的run方法 + :return: + ''' + print('启动线程:',self.thread_id) + self.crawl_spider() + print('退出了该线程:',self.thread_id) + + def crawl_spider(self): + while True: + if self.queue.empty(): #如果队列为空,则跳出 + break + else: + page = self.queue.get() + print('当前工作的线程为:',self.thread_id," 正在采集:",page) + try: + http = urllib3.PoolManager(10) + url = "https://www.suse.com/nbswebapp/yesBulletin.jsp?bulletinNumber=" + str(page) + page_data = http.request('GET', url) + status_code = page_data.status + if status_code == 200: + page_html = page_data.data.decode('utf-8') + data_queue.put((page_html, page)) + except Exception as e: + print('采集线程错误',e) + +class Parser_thread(threading.Thread): + ''' + 解析网页的类,就是对采集结果进行解析,也是多线程方式进行解析 + ''' + def __init__(self,thread_id,queue): + threading.Thread.__init__(self) + self.thread_id = thread_id + self.queue = queue + + def run(self): + print('启动线程:', self.thread_id) + while not flag: + try: + item = self.queue.get(False) # get参数为false时队列为空,会抛出异常 + if not item: + pass + self.parse_data(item) + except Exception as e: + pass + print('退出了该线程:', self.thread_id) + def parse_data(self,item): + ''' + 解析网页内容的函数 + :param item: + :return: + ''' + try: + print("开始解析:", self.thread_id) + page = item[0] + id = item[1] + product = "" + category = "" + company = "" + os = "" + regex = re.compile('(.*?)', re.S) + result = 
regex.findall(page) + regex_category = re.compile('(.*?)', re.S) + result_category = regex_category.findall(page) + regex_company = re.compile('For more information regarding the specific test configuration, please contact:
(.*?)
(.*?)(.*?)', re.S) + result_company = regex_company.findall(page) + regex_os = re.compile('Operating Systems:(.*?)
(.*?)(.*?)(.*?)', re.S) + result_os = regex_os.findall(page) + + for item in result: + product += html.unescape(item.strip()).strip() + + for item in result_category: + category += html.unescape(item.strip()).strip() + + for item in result_company: + company += html.unescape(item[-1].strip()).strip() + + for item in result_os: + os += html.unescape(item[-1].strip()).strip() + + response = { + 'id': id, + 'product': product, + 'category': category, + 'company': company, + 'os': os + } + + all_data.append(response) + except Exception as e: + print('parse: ',e) + + +data_queue = Queue(50) +all_data = [] +flag = False +def main(): + pageQueue = Queue(100) # 任务队列,存放网页的队列 + for page in range(101400, 101500): + pageQueue.put(page) # 构造任务队列 + # 初始化采集线程 + crawl_threads = [] + crawl_name_list = ['crawl_' + str(i) for i in range(1, 6)] # 总共构造5个爬虫线程 + for thread_id in crawl_name_list: + thread = Crawl_thread(thread_id, pageQueue) # 启动爬虫线程 + thread.start() # 启动线程 + crawl_threads.append(thread) + + # 初始化解析线程 + parse_thread = [] + parser_name_list = ['parse_' + str(i) for i in range(1, 6)] + for thread_id in parser_name_list: + thread = Parser_thread(thread_id,data_queue) + thread.start() # 启动线程 + parse_thread.append(thread) + + # 等待队列情况,先进行网页的抓取 + while not pageQueue.empty(): + pass # 不为空,则继续阻塞 + + # 等待所有线程结束 + for t in crawl_threads: + t.join() + # 等待队列情况,对采集的页面队列中的页面进行解析,等待所有页面解析完成 + while not data_queue.empty(): + print("队列里面没有数据") + pass + # 通知线程退出 + global flag + flag = True + for t in parse_thread: + t.join() # 等待所有线程执行到此处再继续往下执行 + + wb = openpyxl.Workbook() + sheet = wb.active + + for col, head in enumerate(all_data[0].keys()): + sheet.cell(row=1, column=col+1, value=head) + + for row, sheet_row in enumerate(all_data): + for col, row_key in enumerate(sheet_row.keys()): + sheet.cell(row=row+2, column=col+1, value=sheet_row[row_key]) + wb.save(filename="suse_compatiable_list.xlsx") + +if __name__ == '__main__': + main()
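+# 用法: python3 get_suse_hardware_compatiable_list.py,结果写入当前目录的 suse_compatiable_list.xlsx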