From ee6c53769b4f4749d901efcc504ff457f16e42ac Mon Sep 17 00:00:00 2001 From: lvmingfu Date: Mon, 21 Feb 2022 09:49:55 +0800 Subject: [PATCH] add white function in lint_link.py --- .../docs/source_en/shor_algorithm.md | 2 +- tools/link_detection/README_CN.md | 17 +++++++ tools/link_detection/filter_linklint.txt | 14 ++++++ tools/link_detection/link_lint.py | 45 +++++++++++++++---- 4 files changed, 68 insertions(+), 10 deletions(-) create mode 100644 tools/link_detection/filter_linklint.txt diff --git a/docs/mindquantum/docs/source_en/shor_algorithm.md b/docs/mindquantum/docs/source_en/shor_algorithm.md index e84e07b0f2..282ade39f2 100644 --- a/docs/mindquantum/docs/source_en/shor_algorithm.md +++ b/docs/mindquantum/docs/source_en/shor_algorithm.md @@ -1,6 +1,6 @@ # Shor's algorithm based on MindQuantum -[![View Source](https://gitee.com/mindspore/docs/raw/master/resource/_static/logo_source_en.png)](https://gitee.com/mindspore/docs/tree/master/docs/mindquantum/docs/source_en/shor_algorithm.md) +[![View Source](https://gitee.com/mindspore/docs/raw/master/resource/_static/logo_source_en.png)](https://gitee.com/mindspore/docs/blob/master/docs/mindquantum/docs/source_en/shor_algorithm.md) ## Introduction to Shor's Algorithm diff --git a/tools/link_detection/README_CN.md b/tools/link_detection/README_CN.md index d9ca6befb3..271abab0ed 100644 --- a/tools/link_detection/README_CN.md +++ b/tools/link_detection/README_CN.md @@ -51,3 +51,20 @@ docs/tutorials/source_zh_cn/intermediate/text/sentimentnet.ipynb:line_22:404: Er - 报错的行数:`line_22`。即检测文件中第22行报错。 - 报错代码:`404`。即该行中存在状态码是404的链接,即不存在该网址。 + +## 检测白名单设置 + +`filter_linklint.txt`文件中存储着链接检测的白名单,每行可写一个链接列入白名单。 +该文件默认放在与`link_lint.py`同目录。也可以通过命令传入`--white_path={白名单文件地址}`载入指定地址的白名单文件。 + +```bash +python link_lint.py --white_path=xxx/xxx/xx.txt xxx xxx +``` + +白名单内容可书写如下: + +```text +https://xxxxx.com +https://xxx.com/xxx.html +... 
+``` \ No newline at end of file diff --git a/tools/link_detection/filter_linklint.txt b/tools/link_detection/filter_linklint.txt new file mode 100644 index 0000000000..47d257eeb4 --- /dev/null +++ b/tools/link_detection/filter_linklint.txt @@ -0,0 +1,14 @@ +https:// +https://xxx +http://127.0.0.1:8080 +https://127.0.0.0:6666 +https://127.0.0.0 +http://127.0.0.1:1500/model/lenet/version/1:predict +http://localhost:%d +http://xxx/v1/mindinsight/profile/cluster-flops +http://xxxx/v1/mindinsight/debugger/sessions/xxxx/update-watchpoint +http://%s:%s%s +http://localhost:5500/x/:add_common +http://xxxx/v1/mindinsight/profile/memory-graphics +https://gitee.com/mindspore/docs/blob/xxx +http://192.168.216.124:11202/scaleout diff --git a/tools/link_detection/link_lint.py b/tools/link_detection/link_lint.py index fdedbe9fb8..6725c007b3 100644 --- a/tools/link_detection/link_lint.py +++ b/tools/link_detection/link_lint.py @@ -36,7 +36,7 @@ def get_urls(content): url_list = [] urls = re.findall(re_url, content) for url in urls: - url_list.append(url[0]+url[1].replace("\n", "")) + url_list.append(url[0]+url[1].split("\n\n")[0].replace("\n", "")) return url_list def check_url_status(url): @@ -94,7 +94,9 @@ def run_check(file): 检测文件中的urls链接 """ data = get_content(file) - urls = get_urls(data) + file_urls = get_urls(data) + white_urls = get_white_urls() + urls = set(file_urls) - set(white_urls) pool = [] for url in urls: k = threading.Thread(target=update_url_status_to_json, args=(url,)) @@ -104,16 +106,40 @@ def run_check(file): j.join() generate_info(file) - os.remove("url_status.json") + if os.path.exists("url_status.json"): + os.remove("url_status.json") + +def get_white_urls(white_file="filter_linklint.txt"): + """获取白名单中的链接""" + for i in sys.argv[1:]: + if "--white_path=" in i: + white_file = i.split("=")[-1] + if os.path.exists(white_file): + try: + with open(white_file, "r", encoding="utf-8") as f: + urls = f.readlines() + except Exception: + with open(white_file, "r", 
encoding="GBK") as f: + urls = f.readlines() + else: + urls = [] + return urls def generate_info(file): """ 输出404链接的信息 """ - with open("url_status.json", "r") as f: - url_status = json.load(f) - with open(file, "r", encoding="utf-8") as f: - lines = f.readlines() + if os.path.exists("url_status.json"): + with open("url_status.json", "r") as f: + url_status = json.load(f) + else: + url_status = {"https://www.mindspore.cn": 200} + try: + with open(file, "r", encoding="utf-8") as f: + lines = f.readlines() + except Exception: + with open(file, "r", encoding="GBK") as f: + lines = f.readlines() for line_num, line_content in enumerate(lines, 1): for i in get_urls(line_content): if url_status[i] == 404: @@ -125,9 +151,10 @@ if __name__ == "__main__": for check_path_ in sys.argv[1:]: - if os.path.isfile(check_path_): + extension = ["md", "py", "rst", "ipynb", "js", "html", "c", "cc", "txt"] + if os.path.isfile(check_path_) and check_path_.split(".")[-1] in extension: run_check(check_path_) elif os.path.isdir(check_path_): - check_f_ = [file for file in find_file(check_path_, files=[])] + check_f_ = [file for file in find_file(check_path_, files=[]) if file.split(".")[-1] in extension] for one_f in check_f_: run_check(one_f) -- Gitee