Skip to content

Commit a9d670b

Browse files
committed
Merge main back into 6-refactor-handling-settings (#7)
* Added script to download the data sets from IMDb.
* Decompress the gzip files into another directory.
* Fixed typo in docstring
* Moved everything to a new class. Removed most of the settings stored in a global variable. Renamed block size to chunk size. Fixed the progress bars and spent way too long trying to work out the uncompressed size of a gzip'd file.
* Docstrings comments. Changed chunk-size default
* Renamed project in pyproject.toml
* Mark project as not a package (for now)
* Added some more details to the CLI
* Download all IMDB files. Fix chunk size
* Following code review from @sfkleach moved functions to a new method and removed globals for settings and class mappings.
* Bumped versions. Add compressed sqlite database to prebuilt. Merge main back into 6-refactor-handling-settings (#7)
1 parent 2ad3d38 commit a9d670b

1 file changed

Lines changed: 156 additions & 0 deletions

File tree

download_imdb.py

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
"""Download dataset files from IMDb and then decompress them."""
2+
3+
import os
4+
import gzip
5+
import argparse
6+
import requests
7+
from tqdm import tqdm
8+
9+
# Default configuration settings
10+
SETTINGS = {
    # Set of gzip-compressed TSV file names to fetch; each entry is the
    # data set stem followed by the common ".tsv.gz" suffix.
    "data_files": {
        f"{stem}.tsv.gz"
        for stem in (
            "name.basics",
            "title.akas",
            "title.basics",
            "title.crew",
            "title.episode",
            "title.principals",
            "title.ratings",
        )
    },
}
21+
22+
23+
class IMDbDownloader:
    """Download and decompress a single gzip-compressed IMDb data set file."""

    def __init__(self, download_dir, zip_file, chunk_size=1024):
        """Initialise IMDbDownloader object

        Args:
            download_dir (string): Directory to download data file to
            zip_file (string): Name of compressed data file to download
            chunk_size (int, optional): Chunk size in bytes when downloading
                and decompressing data files. Defaults to 1024.
        """
        self.download_dir = download_dir
        self.zip_file = zip_file
        # Only the last extension is stripped: "x.tsv.gz" -> "x.tsv"
        self.unzip_file = os.path.splitext(zip_file)[0]
        self.chunk_size = chunk_size

    def download_file(self, data_location):
        """Download non-commercial data sets from IMDb.

        The file is streamed to ``<download_dir>/<zip_file>`` in chunks of
        ``chunk_size`` bytes while a progress bar is displayed.

        Args:
            data_location (string): URL where files are located to download

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        url = data_location + self.zip_file

        # Format for tqdm progress bar
        bar_format = "Progress: {l_bar}{bar} | Completed: {n_fmt} | Time: [{elapsed}]"

        # Create the download directory if it doesn't exist
        os.makedirs(self.download_dir, exist_ok=True)
        download_path = os.path.join(self.download_dir, self.zip_file)

        # Streaming, so we can iterate over the response. The context manager
        # releases the connection even if writing fails part-way through.
        with requests.get(url, stream=True, timeout=60) as response:
            # Fail before touching the disk so an HTTP error page is never
            # saved under the name of the data file.
            response.raise_for_status()
            total_size = int(response.headers.get("content-length", 0))

            # The file is opened in its own context manager so that it is
            # explicitly closed (and flushed) rather than left to garbage
            # collection once the progress-bar wrapper goes away.
            with open(download_path, "wb") as raw_out:
                with tqdm.wrapattr(
                    raw_out,
                    "write",
                    total=total_size,
                    unit="B",
                    unit_scale=True,
                    desc=f"Downloading {self.zip_file}",
                    bar_format=bar_format,
                    leave=False,
                ) as f_out:
                    for chunk in response.iter_content(chunk_size=self.chunk_size):
                        f_out.write(chunk)

    def decompress_file(self, output_dir):
        """Decompress the gzipped datasets

        Reads ``<download_dir>/<zip_file>`` and writes the decompressed data
        to ``<output_dir>/<unzip_file>`` in chunks of ``chunk_size`` bytes.

        Args:
            output_dir (string): Directory to write decompressed files to
        """

        # Create the import directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)
        input_file = os.path.join(self.download_dir, self.zip_file)

        # The uncompressed output file uses the name without the .gz extension
        output_file = os.path.join(output_dir, self.unzip_file)

        # Format for tqdm progress bar (no total: the uncompressed size is
        # not known up front, so only the running byte count is shown)
        bar_format = "Completed: {n_fmt} | Time: [{elapsed}]"
        with gzip.open(input_file, "rb") as f_in:
            with open(output_file, "wb") as f_out:
                # Using the bar as a context manager guarantees it is closed,
                # which is what clears it from the terminal when leave=False.
                with tqdm(
                    unit="B",
                    unit_scale=True,
                    desc=f"Decompressing {self.zip_file}",
                    bar_format=bar_format,
                    leave=False,
                ) as p_bar:
                    # read() returns b"" at end of file, which ends the loop
                    for block in iter(lambda: f_in.read(self.chunk_size), b""):
                        f_out.write(block)
                        p_bar.update(len(block))
104+
105+
106+
if __name__ == "__main__":

    # Parse command-line arguments
    parser = argparse.ArgumentParser(
        prog="download_imdb",
        description="Download non-commercial data sets from IMDb.",
        epilog="You can process the files with load_imdb.",
    )
    parser.add_argument(
        "--data_location",
        type=str,
        default="https://datasets.imdbws.com/",
        help="Location of the IMDb non-commercial data files",
    )
    parser.add_argument(
        "--download_dir",
        type=str,
        default="downloads",
        help="Directory to save downloaded files",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="import",
        help="Directory to save unzipped dataset files (default:import)",
    )
    parser.add_argument(
        "--chunk_size",
        type=int,
        default=1048576,
        help="Block size in bytes when downloading files (default: 1048576)",
    )
    parser.add_argument(
        "--no_download", action="store_true", help="Don't download dataset files"
    )
    parser.add_argument(
        "--no_decompress", action="store_true", help="Don't decompress dataset files"
    )
    args = parser.parse_args()

    # The configured file names are a set, whose iteration order can differ
    # between runs; sorting makes the processing order reproducible.
    data_files = sorted(SETTINGS["data_files"])

    for zip_name in tqdm(
        data_files,
        total=len(data_files),
        desc="Processing IMDb dataset files",
    ):
        # Forward --chunk_size; without it the downloader silently falls back
        # to its own 1024-byte default and the option has no effect.
        imdb_downloader = IMDbDownloader(
            args.download_dir, zip_name, chunk_size=args.chunk_size
        )
        if not args.no_download:
            imdb_downloader.download_file(data_location=args.data_location)

        if not args.no_decompress:
            imdb_downloader.decompress_file(args.output_dir)

0 commit comments

Comments
 (0)