""" Create data in city;yyyy-mm-dd HH:mm:ss.SSS;temp format Avg line length: ~36 bytes Expected output file size: >3 GB -> 3 000 000 000 bytes Needed lines count = ~83.(3)m 66 district cities * 75 years with measurement twice per hour = 66*75*365*24*2 = 86 724 000 -> ~3.1 GB """ import datetime import numpy as np cities = [ "Biała Podlaska", "Białystok", "Bielsko-Biała", "Bydgoszcz", "Bytom", "Chełm", "Chorzów", "Częstochowa", "Dąbrowa Górnicza", "Elbląg", "Gdańsk", "Gdynia", "Gliwice", "Gorzów Wielkopolski", "Grudziądz", "Jastrzębie-Zdrój", "Jaworzno", "Jelenia Góra", "Kalisz", "Katowice", "Kielce", "Konin", "Koszalin", "Kraków", "Krosno", "Legnica", "Leszno", "Lublin", "Łomża", "Łódź", "Mysłowice", "Nowy Sącz", "Olsztyn", "Opole", "Ostrołęka", "Piekary Śląskie", "Piotrków Trybunalski", "Płock", "Poznań", "Przemyśl", "Radom", "Ruda Śląska", "Rybnik", "Rzeszów", "Siedlce", "Siemianowice Śląskie", "Skierniewice", "Słupsk", "Sopot", "Sosnowiec", "Suwałki", "Szczecin", "Świętochłowice", "Świnoujście", "Tarnobrzeg", "Tarnów", "Toruń", "Tychy", "Wałbrzych", "Włocławek", "Wrocław", "Zabrze", "Zamość", "Zielona Góra", "Żory" ] begin_date = datetime.datetime(year=1949, month=1, day=1, hour=0, minute=0, second=0) end_date = begin_date + datetime.timedelta(days=365 * 75) generator = np.random.default_rng(790492283396) batch = iter(generator.integers(low=-1500, high=3500, size=66*75*365*24*2)) start = datetime.datetime.now() with open('../data/temperatures.csv', 'w', encoding='utf-8') as target: for city in cities: print(city) now = begin_date while now < end_date: target.write("{};{}.000;{}\n".format(city, now, int(next(batch)) / 100.0)) now += datetime.timedelta(minutes=30) end = datetime.datetime.now() print("Completed in {}".format(end - start))