"""Convert a ``.pdf`` / ``.ai`` document into a folder of per-page JPEG images.

Relies on the third-party ``pdf2image`` package.

NOTE(review): the accompanying requirements file pins the dependency as
``pdf2image=1.16.0``; a version pin needs ``==`` (``pdf2image==1.16.0``).
"""

import os

# Lower-case suffixes accepted by pdf2imgs (matched case-insensitively).
_ALLOWED_SUFFIXES = (".pdf", ".ai")


class _ArgumentTypeError(TypeError, AssertionError):
    """Raised when an argument has the wrong type.

    It is also an ``AssertionError`` because these checks used to be plain
    ``assert`` statements; code that caught ``AssertionError`` keeps working,
    while the check itself is no longer stripped by ``python -O``.
    """


def pdf2imgs(pdf_path: str, out_dir: str = ".") -> None:
    """Render every page of a document to ``<out_dir>/<name>/page<i>.jpg``.

    ``<name>`` is the file name of ``pdf_path`` without its directory and
    without its final extension; ``<i>`` is the zero-based page index.

    Args:
        pdf_path (str): path of the input file; must end with ``.pdf`` or
            ``.ai`` (case-insensitive).
        out_dir (str, optional): parent directory in which the ``<name>``
            folder is created. Defaults to ".".

    Raises:
        TypeError: ``pdf_path`` or ``out_dir`` is not a ``str``.
        ValueError: ``pdf_path`` does not end with ``.pdf`` or ``.ai``.

    Example:
        >>> pdf2imgs("test.pdf", "./test")  # doctest: +SKIP
    """
    if not isinstance(pdf_path, str):
        raise _ArgumentTypeError("pdf_path must be str")
    if not isinstance(out_dir, str):
        raise _ArgumentTypeError("out_dir must be str")

    # Validate before touching the filesystem so that a rejected input does
    # not leave an empty output directory behind.
    if not pdf_path.lower().endswith(_ALLOWED_SUFFIXES):
        raise ValueError("file must end with .pdf or .ai")

    # Imported here so that argument errors are reported even on a machine
    # where pdf2image is not installed.
    from pdf2image import convert_from_path

    # basename/splitext handle the platform's path separator and names that
    # contain extra dots, unlike manual "/" and "." searching.
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    out_dir = os.path.join(out_dir, pdf_name)
    os.makedirs(out_dir, exist_ok=True)

    images = convert_from_path(pdf_path)
    for i, img in enumerate(images):
        img.save(os.path.join(out_dir, f"page{i}.jpg"))
0000000000000000000000000000000000000000..c7bf771632081c4f27f736ecbab529558a844da8 Binary files /dev/null and b/contributors/bob-zhao/word2vec.pdf differ