Eelalzep committed on
Commit
ec11a8b
·
verified ·
1 Parent(s): 9fb8a73

Upload archive-org-downloader.py

Browse files
Files changed (1) hide show
  1. archive-org-downloader.py +274 -0
archive-org-downloader.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import random, string
3
+ from concurrent import futures
4
+ from tqdm import tqdm
5
+ import time
6
+ from datetime import datetime
7
+ import argparse
8
+ import os
9
+ import sys
10
+ import shutil
11
+ import json
12
+
13
def display_error(response, message):
    """Print an error message and the offending HTTP response, then abort.

    Args:
        response: Response object to dump. The object itself is printed,
            followed by its ``text`` attribute.
        message: Human-readable description of what went wrong.

    Raises:
        SystemExit: Always, with exit status 1.
    """
    print(message)
    print(response)
    print(response.text)
    # Use ``sys.exit`` instead of the ``site``-provided ``exit()`` helper, and
    # a non-zero status so that a calling shell can detect the failure.
    sys.exit(1)
18
+
19
def get_book_infos(session, url):
    """Fetch the title, page image links and metadata of a book.

    The details page at ``url`` is downloaded and scanned for its first
    ``"url":"..."`` entry. That protocol-relative address is requested in
    turn and its JSON payload is read.

    Args:
        session: Object exposing ``get(url)`` whose result has a ``text``
            attribute and a ``json()`` method.
        url: Address of the book's details page.

    Returns:
        tuple: ``(title, links, metadata)`` where ``title`` is a sanitised
        name (at most 150 characters), ``links`` is the flat list of page
        ``uri`` values and ``metadata`` is the payload's ``metadata`` entry.

    Raises:
        SystemExit: With status 1 when the details page holds no
            ``"url":"`` marker or when fewer than two page links are found.
    """
    page = session.get(url).text
    # Everything after the first marker, up to the next double quote, is the
    # (protocol-relative) address of the JSON description of the book.
    _, marker, tail = page.partition('"url":"')
    if not marker:
        # Previously this surfaced as a bare IndexError traceback.
        print("[-] Error while getting image links")
        sys.exit(1)
    infos_url = "https:" + tail.split('"')[0].replace("\\u0026", "&")

    response = session.get(infos_url)
    data = response.json()['data']

    title = data['brOptions']['bookTitle'].strip().replace(" ", "_")
    # Filter forbidden chars in directory names (Windows & Linux)
    title = ''.join(c for c in title if c not in '<>:"/\\|?*')
    title = title[:150]  # Trim the title to avoid long file names
    metadata = data['metadata']

    # 'data' is a list of groups, each group being a list of page entries.
    links = [page_info['uri'] for item in data['brOptions']['data'] for page_info in item]

    # NOTE(review): a book with exactly one page is treated as a failure,
    # as in the original code -- confirm this threshold is intended.
    if len(links) > 1:
        print(f"[+] Found {len(links)} pages")
        return title, links, metadata

    print("[-] Error while getting image links")
    sys.exit(1)
39
+
40
def format_data(content_type, fields):
    """Build a ``multipart/form-data`` request body from simple text fields.

    Args:
        content_type: The multipart boundary string (despite its name), i.e.
            the value announced after ``boundary=`` in the request header.
        fields: Mapping of form field names to their values.

    Returns:
        str: One part per field, each introduced by ``--<boundary>``,
        followed by the closing delimiter ``--<boundary>--``.
    """
    parts = [
        f'--{content_type}\r\nContent-Disposition: form-data; name="{name}"\r\n\r\n{value}\r\n'
        for name, value in fields.items()
    ]
    # The closing delimiter needs the same leading "--" as every other
    # delimiter; it was previously emitted as "<boundary>--".
    parts.append(f"--{content_type}--")
    return "".join(parts)
46
+
47
def login(email, password):
    """Open a session and authenticate it with the given credentials.

    The login page is requested once, then the credentials are posted to the
    same address as a hand-built multipart form with a random boundary.

    Args:
        email: Account e-mail, sent as the ``username`` form field.
        password: Account password.

    Returns:
        The authenticated ``requests.Session`` when the reply contains
        "Successful login".

    The program exits when the reply contains "bad_login"; any other reply
    is handed to ``display_error``.
    """
    login_url = "https://archive.org/account/login"

    session = requests.Session()
    session.get(login_url)

    # Random 16-character suffix for the multipart boundary.
    alphabet = string.ascii_letters + string.digits
    content_type = "----WebKitFormBoundary" + "".join(random.sample(alphabet, 16))

    form = {"username": email, "password": password, "submit_by_js": "true"}
    response = session.post(
        login_url,
        data=format_data(content_type, form),
        headers={'Content-Type': 'multipart/form-data; boundary=' + content_type},
    )

    body = response.text
    if "bad_login" in body:
        print("[-] Invalid credentials!")
        exit()
    if "Successful login" in body:
        print("[+] Successful login")
        return session
    display_error(response, "[-] Error while login:")
64
+
65
def loan(session, book_id, verbose=True):
    """Borrow a book through the loan service.

    Three requests are posted in order: ``grant_access`` (to the
    ``searchInside.php`` endpoint), ``browse_book`` and ``create_token``.

    Args:
        session: Object exposing ``post(url, data=...)``.
        book_id: Identifier of the book to borrow.
        verbose: When true, print a confirmation after a successful loan.

    Returns:
        The same ``session``, either after a successful loan or when the
        service answers 400 with its "not available to borrow" message.
        Every other failure is handed to ``display_error``.
    """
    loans_url = "https://archive.org/services/loans/loan/"

    def _post(action, endpoint=loans_url):
        # Every request carries the same two form fields.
        return session.post(endpoint, data={"action": action, "identifier": book_id})

    _post("grant_access", loans_url + "searchInside.php")
    response = _post("browse_book")

    if response.status_code == 400:
        not_available = "This book is not available to borrow at this time. Please try again later."
        if response.json()["error"] == not_available:
            print("This book doesn't need to be borrowed")
            return session
        display_error(response, "Something went wrong when trying to borrow the book.")

    response = _post("create_token")

    if "token" not in response.text:
        display_error(response, "Something went wrong when trying to borrow the book, maybe you can't borrow this book.")
        return None

    if verbose:
        print("[+] Successful loan")
    return session
90
+
91
def return_loan(session, book_id):
    """Give a borrowed book back.

    Args:
        session: Object exposing ``post(url, data=...)``.
        book_id: Identifier of the book to return.

    A confirmation is printed when the service answers 200 with a truthy
    ``success`` field; any other answer is handed to ``display_error``.
    """
    payload = dict(action="return_loan", identifier=book_id)
    response = session.post("https://archive.org/services/loans/loan/", data=payload)

    # The JSON body is only inspected for a 200 answer.
    returned = response.status_code == 200 and response.json()["success"]
    if not returned:
        display_error(response, "Something went wrong when trying to return the book")
        return
    print("[+] Book returned")
101
+
102
def image_name(pages, page, directory):
    """Return the path of the JPG file holding one page.

    Args:
        pages: Total number of pages; its number of digits sets the width
            of the file name.
        page: Page index, left-padded with zeros up to that width.
        directory: Folder prefix, joined with a forward slash.

    Returns:
        str: ``"<directory>/<zero-padded page>.jpg"``.
    """
    width = len(str(pages))
    stem = str(page).rjust(width, "0")
    return f"{directory}/{stem}.jpg"
104
+
105
def download_one_image(session, link, i, directory, book_id, pages):
    """Download one page image and store it in ``directory``.

    The request is repeated until it succeeds (status 200). A 403 answer
    triggers a new ``loan`` call before the next attempt.

    Args:
        session: Object exposing ``get(url, headers=...)``.
        link: Address of the page image.
        i: Index of the page, used to build the file name.
        directory: Folder receiving the image.
        book_id: Identifier of the book, needed to borrow it again.
        pages: Total number of pages, used to pad the file name.
    """
    headers = {
        "Referer": "https://archive.org/",
        "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
        "Sec-Fetch-Site": "same-site",
        "Sec-Fetch-Mode": "no-cors",
        "Sec-Fetch-Dest": "image",
    }
    while True:
        try:
            response = session.get(link, headers=headers)
            if response.status_code == 200:
                break
            if response.status_code == 403:
                # Access denied: borrow the book again before retrying.
                session = loan(session, book_id, verbose=False)
        except Exception:
            # Best effort: request errors are treated as transient and
            # retried. KeyboardInterrupt/SystemExit are no longer swallowed.
            pass
        # Wait 1 second before every retry, whatever the reason. Unexpected
        # status codes used to be retried in a tight loop with no delay.
        time.sleep(1)

    image = image_name(pages, i, directory)
    with open(image, "wb") as f:
        f.write(response.content)
128
+
129
+
130
def download(session, n_threads, directory, links, scale, book_id):
    """Download every page of a book using a pool of threads.

    Args:
        session: Session object handed to each ``download_one_image`` call.
        n_threads: Maximum number of worker threads.
        directory: Folder receiving the images.
        links: Page image addresses; ``&rotate=0&scale=<scale>`` is appended
            to each of them.
        scale: Value of the ``scale`` query parameter.
        book_id: Identifier of the book, handed to each download task.

    Returns:
        list: Expected image paths, in page order.
    """
    print("Downloading pages...")
    links = [f"{link}&rotate=0&scale={scale}" for link in links]
    pages = len(links)

    with futures.ThreadPoolExecutor(max_workers=n_threads) as executor:
        # enumerate() gives each link its own position. links.index(link)
        # was quadratic and gave duplicate links the same page index.
        tasks = [
            executor.submit(
                download_one_image,
                session=session,
                link=link,
                i=i,
                directory=directory,
                book_id=book_id,
                pages=pages,
            )
            for i, link in enumerate(links)
        ]
        # Iterate only to drive the progress bar; results are not inspected.
        for _ in tqdm(futures.as_completed(tasks), total=len(tasks)):
            pass

    return [image_name(pages, i, directory) for i in range(pages)]
145
+
146
def make_pdf(pdf, title, directory):
    """Write PDF bytes to ``<directory>/<title>.pdf`` without overwriting.

    Args:
        pdf: Bytes of the PDF document.
        title: Base name of the file, without extension.
        directory: Folder receiving the file.

    When a file of that name already exists, ``(1)``, ``(2)``, ... is
    appended to the title until a free name is found. The chosen file name
    is printed.
    """
    filename = title + ".pdf"
    # Handle the case where multiple books with the same name are downloaded
    attempt = 0
    while os.path.isfile(os.path.join(directory, filename)):
        attempt += 1
        filename = f"{title}({attempt}).pdf"

    target = os.path.join(directory, filename)
    with open(target, "wb") as out:
        out.write(pdf)
    print(f"[+] PDF saved as \"{filename}\"")
157
+
158
if __name__ == "__main__":
    # Command-line entry point: parse the options, log in, then for every
    # requested book borrow it, download its pages, optionally merge them
    # into a PDF, and give the loan back.

    my_parser = argparse.ArgumentParser()
    my_parser.add_argument('-e', '--email', help='Your archive.org email', type=str, required=True)
    my_parser.add_argument('-p', '--password', help='Your archive.org password', type=str, required=True)
    my_parser.add_argument('-u', '--url', help='Link to the book (https://archive.org/details/XXXX). You can use this argument several times to download multiple books', action='append', type=str)
    my_parser.add_argument('-d', '--dir', help='Output directory', type=str)
    my_parser.add_argument('-f', '--file', help='File where are stored the URLs of the books to download', type=str)
    my_parser.add_argument('-r', '--resolution', help='Image resolution (10 to 0, 0 is the highest), [default 3]', type=int, default=3)
    my_parser.add_argument('-t', '--threads', help="Maximum number of threads, [default 50]", type=int, default=50)
    my_parser.add_argument('-j', '--jpg', help="Output to individual JPG's rather than a PDF", action='store_true')
    my_parser.add_argument('-m', '--meta', help="Output the metadata of the book to a json file (-j option required)", action='store_true')

    if len(sys.argv) == 1:
        my_parser.print_help(sys.stderr)
        sys.exit(1)
    args = my_parser.parse_args()

    if args.url is None and args.file is None:
        my_parser.error("At least one of --url and --file required")

    email = args.email
    password = args.password
    scale = args.resolution
    n_threads = args.threads
    d = args.dir

    # Default to the current working directory when no output dir is given.
    if d is None:
        d = os.getcwd()
    elif not os.path.isdir(d):
        print("Output directory does not exist!")
        sys.exit(1)

    # --url takes precedence over --file when both are given.
    if args.url is not None:
        urls = args.url
    elif os.path.exists(args.file):
        with open(args.file) as f:
            # Strip every line and skip empty ones so that CRLF line endings
            # and blank lines do not produce "invalid" URLs.
            urls = [line.strip() for line in f if line.strip()]
        if not urls:
            print(f"{args.file} does not contain any URL!")
            sys.exit(1)
    else:
        print(f"{args.file} does not exist!")
        sys.exit(1)

    # Check the urls format
    for url in urls:
        if not url.startswith("https://archive.org/details/"):
            print(f"{url} --> Invalid url. URL must starts with \"https://archive.org/details/\"")
            sys.exit(1)

    print(f"{len(urls)} Book(s) to download")
    session = login(email, password)

    for url in urls:
        # Fourth non-empty path component: https: / archive.org / details / <id>
        book_id = list(filter(None, url.split("/")))[3]
        print("="*40)
        print(f"Current book: https://archive.org/details/{book_id}")
        session = loan(session, book_id)
        title, links, metadata = get_book_infos(session, url)

        directory = os.path.join(d, title)
        # Handle the case where multiple books with the same name are downloaded
        i = 1
        _directory = directory
        while os.path.isdir(directory):
            directory = f"{_directory}({i})"
            i += 1
        os.makedirs(directory)

        if args.meta:
            # metadata.json lives in the images folder, which is removed
            # below unless --jpg is given.
            print("Writing metadata.json...")
            with open(f"{directory}/metadata.json", 'w') as f:
                json.dump(metadata, f)

        images = download(session, n_threads, directory, links, scale, book_id)

        if not args.jpg:  # Create pdf with images and remove the images folder
            import img2pdf

            # prepare PDF metadata
            # sometimes archive metadata is missing
            pdfmeta = {}
            # ensure metadata are str
            for key in ["title", "creator", "associated-names"]:
                if key in metadata:
                    if isinstance(metadata[key], str):
                        pass
                    elif isinstance(metadata[key], list):
                        metadata[key] = "; ".join(metadata[key])
                    else:
                        raise Exception("unsupported metadata type")
            # title
            if 'title' in metadata:
                pdfmeta['title'] = metadata['title']
            # author
            if 'creator' in metadata and 'associated-names' in metadata:
                pdfmeta['author'] = metadata['creator'] + "; " + metadata['associated-names']
            elif 'creator' in metadata:
                pdfmeta['author'] = metadata['creator']
            elif 'associated-names' in metadata:
                pdfmeta['author'] = metadata['associated-names']
            # date
            if 'date' in metadata:
                try:
                    # Only the first four characters (the year) are parsed.
                    pdfmeta['creationdate'] = datetime.strptime(metadata['date'][0:4], '%Y')
                except (TypeError, ValueError):
                    # Unparseable or non-string date: leave it out of the PDF.
                    pass
            # keywords
            pdfmeta['keywords'] = [f"https://archive.org/details/{book_id}"]

            pdf = img2pdf.convert(images, **pdfmeta)
            # 'd' is already the resolved output directory.
            make_pdf(pdf, title, d)
            try:
                shutil.rmtree(directory)
            except OSError as e:
                print ("Error: %s - %s." % (e.filename, e.strerror))

        return_loan(session, book_id)