large-file-reading-challenge/util/test-generator.py

100 lines
2.1 KiB
Python

"""
Create data in city;yyyy-mm-dd HH:mm:ss.SSS;temp format
Avg line length: ~36 bytes
Expected output file size: >3 GB -> 3 000 000 000 bytes
Needed lines count = ~83.(3)m
66 district cities * 75 years with measurement twice per hour = 66*75*365*24*2 = 86 724 000 -> ~3.1 GB
"""
import datetime
import numpy as np
# City names written into the first column of every generated record.
# Records are emitted city by city in exactly this order.
# NOTE(review): this list has 65 entries; Warszawa, which is also a city with
# county rights, is not among them -- confirm whether that is intentional.
cities = [
    "Biała Podlaska",
    "Białystok",
    "Bielsko-Biała",
    "Bydgoszcz",
    "Bytom",
    "Chełm",
    "Chorzów",
    "Częstochowa",
    "Dąbrowa Górnicza",
    "Elbląg",
    "Gdańsk",
    "Gdynia",
    "Gliwice",
    "Gorzów Wielkopolski",
    "Grudziądz",
    "Jastrzębie-Zdrój",
    "Jaworzno",
    "Jelenia Góra",
    "Kalisz",
    "Katowice",
    "Kielce",
    "Konin",
    "Koszalin",
    "Kraków",
    "Krosno",
    "Legnica",
    "Leszno",
    "Lublin",
    "Łomża",
    "Łódź",
    "Mysłowice",
    "Nowy Sącz",
    "Olsztyn",
    "Opole",
    "Ostrołęka",
    "Piekary Śląskie",
    "Piotrków Trybunalski",
    "Płock",
    "Poznań",
    "Przemyśl",
    "Radom",
    "Ruda Śląska",
    "Rybnik",
    "Rzeszów",
    "Siedlce",
    "Siemianowice Śląskie",
    "Skierniewice",
    "Słupsk",
    "Sopot",
    "Sosnowiec",
    "Suwałki",
    "Szczecin",
    "Świętochłowice",
    "Świnoujście",
    "Tarnobrzeg",
    "Tarnów",
    "Toruń",
    "Tychy",
    "Wałbrzych",
    "Włocławek",
    "Wrocław",
    "Zabrze",
    "Zamość",
    "Zielona Góra",
    "Żory",
]
# Time range covered for every city: 75 blocks of 365 days starting at
# 1949-01-01 00:00:00 (leap days are not accounted for).
begin_date = datetime.datetime(year=1949, month=1, day=1, hour=0, minute=0, second=0)
end_date = begin_date + datetime.timedelta(days=365 * 75)

# Interval between two consecutive measurements. Built once here instead of
# on every loop iteration.
step = datetime.timedelta(minutes=30)
# Number of timestamps in [begin_date, end_date) for one city
# (timedelta // timedelta yields an int): 75 * 365 * 24 * 2.
samples_per_city = (end_date - begin_date) // step

# Fixed seed so repeated runs draw the same values.
generator = np.random.default_rng(790492283396)
# Draw all values up front as integers in [-1500, 3500); they are divided by
# 100 when written, giving temperatures from -15.0 to 34.99.
# The pool size is derived from len(cities) and the date range so it always
# matches exactly what the loop below consumes (a hard-coded city count can
# over-allocate, or run out of values if the list grows).
# NOTE(review): assumes integers() fills its output sequentially, so the
# values drawn here equal the leading values of a larger draw with the same
# seed -- confirm if byte-identical output to earlier runs matters.
batch = iter(generator.integers(low=-1500, high=3500, size=len(cities) * samples_per_city))

start = datetime.datetime.now()
with open('../data/temperatures.csv', 'w', encoding='utf-8') as target:
    for city in cities:
        print(city)  # progress indicator: one line per city
        now = begin_date
        while now < end_date:
            # str(datetime) omits the fractional part when microseconds are 0,
            # so ".000" is appended to get the HH:mm:ss.SSS layout.
            # The value uses Python's default float formatting (e.g. "12.3",
            # not "12.30").
            target.write("{};{}.000;{}\n".format(city, now, int(next(batch)) / 100.0))
            now += step
end = datetime.datetime.now()
print("Completed in {}".format(end - start))