123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164 |
- import logging
- import os
- import socket
- import time
- from hashlib import sha256
- from urllib3 import PoolManager, Retry
- from urllib3.response import BaseHTTPResponse
- from urllib3.util import Timeout
- from openpilot.common.file_helpers import atomic_write_in_dir
- from openpilot.system.hardware.hw import Paths
# Size in bytes of one cached chunk (decimal units: 1000 * 1000).
K = 1000
CHUNK_SIZE = K * 1000

# Only let urllib3 log at WARNING level and above.
logging.getLogger("urllib3").setLevel(logging.WARNING)
def hash_256(link: str) -> str:
    """Return the hex SHA-256 digest of ``link`` with everything from the first ``?`` onward dropped.

    The remaining text is encoded as UTF-8 before hashing.
    """
    base, _, _ = link.partition("?")
    return sha256(base.encode("utf-8")).hexdigest()
class URLFileException(Exception):
    """Raised when a GET request returns an HTTP status other than the one expected."""
class URLFile:
    """File-like reader for a URL, with an optional on-disk chunk cache.

    All requests go through one lazily created, class-wide urllib3
    ``PoolManager``. When caching is enabled, ``read()`` fetches data in
    ``CHUNK_SIZE``-aligned pieces and stores each piece in the directory
    returned by ``Paths.download_cache_root()``, in a file named from
    ``hash_256(url)`` and the chunk number.
    """

    # Shared pool manager; built on first use by pool_manager(), cleared by reset().
    _pool_manager: PoolManager|None = None

    @staticmethod
    def reset() -> None:
        """Forget the shared pool manager so the next request builds a new one."""
        URLFile._pool_manager = None

    @staticmethod
    def pool_manager() -> PoolManager:
        """Return the shared ``PoolManager``, creating it on the first call."""
        if URLFile._pool_manager is None:
            socket_options = [(socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1),]
            retries = Retry(total=5, backoff_factor=0.5, status_forcelist=[409, 429, 503, 504])
            URLFile._pool_manager = PoolManager(num_pools=10, maxsize=100, socket_options=socket_options, retries=retries)
        return URLFile._pool_manager

    def __init__(self, url: str, timeout: int=10, debug: bool=False, cache: bool|None=None):
        """Set up a reader positioned at offset 0.

        Args:
            url: Address of the remote file.
            timeout: Value used for both the connect and the read timeout.
            debug: When true, GET requests slower than 0.1 s are printed.
            cache: ``True``/``False`` forces the chunk cache on/off. ``None``
                defers to the ``FILEREADER_CACHE`` environment variable
                (parsed as an int; unset or ``0`` means no caching).
        """
        self._url = url
        self._timeout = Timeout(connect=timeout, read=timeout)
        self._pos = 0
        self._length: int|None = None
        self._debug = debug
        # True by default, false if FILEREADER_CACHE is set to a non-zero int, but can be overridden by the cache argument
        self._force_download = not int(os.environ.get("FILEREADER_CACHE", "0"))
        if cache is not None:
            self._force_download = not cache

        if not self._force_download:
            os.makedirs(Paths.download_cache_root(), exist_ok=True)

    def __enter__(self):
        """Return ``self`` so the reader can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Do nothing on exit; exceptions are not suppressed."""
        pass

    def _request(self, method: str, url: str, headers: dict[str, str]|None=None) -> BaseHTTPResponse:
        """Issue one request through the shared pool manager using this reader's timeout."""
        return URLFile.pool_manager().request(method, url, timeout=self._timeout, headers=headers)

    def get_length_online(self) -> int:
        """Ask the server for the file length with a HEAD request.

        Returns:
            ``-1`` if the response status is not 2xx; otherwise the
            ``content-length`` header as an int (``0`` if the header is absent).
        """
        response = self._request('HEAD', self._url)
        if not (200 <= response.status <= 299):
            return -1
        length = response.headers.get('content-length', 0)
        return int(length)

    def get_length(self) -> int:
        """Return the file length, remembering it on the instance.

        With caching enabled the length is also read from / written to a
        ``<hash>_length`` file in the cache directory; a failed lookup (``-1``)
        is never written to disk.
        """
        if self._length is not None:
            return self._length

        file_length_path = os.path.join(Paths.download_cache_root(), hash_256(self._url) + "_length")
        if not self._force_download and os.path.exists(file_length_path):
            with open(file_length_path) as file_length:
                content = file_length.read()
                self._length = int(content)
                return self._length

        self._length = self.get_length_online()
        if not self._force_download and self._length != -1:
            with atomic_write_in_dir(file_length_path, mode="w") as file_length:
                file_length.write(str(self._length))
        return self._length

    def read(self, ll: int|None=None) -> bytes:
        """Read up to ``ll`` bytes from the current position (to the end if ``ll`` is None).

        Without caching this is a direct ``read_aux`` call. With caching, every
        ``CHUNK_SIZE``-aligned chunk overlapping the requested span is taken
        from the cache directory, downloading and storing it first if missing,
        and the position is left at the end of the requested span.
        """
        if self._force_download:
            return self.read_aux(ll=ll)

        file_begin = self._pos
        file_end = self._pos + ll if ll is not None else self.get_length()
        assert file_end != -1, f"Remote file is empty or doesn't exist: {self._url}"
        # We have to align with the chunks we store. Position is the beginning of the latest chunk that starts before or at our offset
        position = (file_begin // CHUNK_SIZE) * CHUNK_SIZE
        url_hash = hash_256(self._url)  # does not change between chunks
        pieces: list[bytes] = []  # joined once at the end instead of repeated bytes concatenation
        while True:
            self._pos = position

            # True division gives a float, so names end in e.g. "_3.0"; kept so chunks already on disk are still found
            chunk_number = self._pos / CHUNK_SIZE
            file_name = url_hash + "_" + str(chunk_number)
            full_path = os.path.join(Paths.download_cache_root(), str(file_name))
            # If we don't have the chunk on disk, download and store it
            if not os.path.exists(full_path):
                data = self.read_aux(ll=CHUNK_SIZE)
                with atomic_write_in_dir(full_path, mode="wb") as new_cached_file:
                    new_cached_file.write(data)
            else:
                with open(full_path, "rb") as cached_file:
                    data = cached_file.read()

            # Keep only the part of this chunk that lies inside [file_begin, file_end)
            pieces.append(data[max(0, file_begin - position): min(CHUNK_SIZE, file_end - position)])

            position += CHUNK_SIZE
            if position >= file_end:
                self._pos = file_end
                return b"".join(pieces)

    def read_aux(self, ll: int|None=None) -> bytes:
        """Download bytes starting at the current position, bypassing the cache.

        A ``Range`` header is sent unless the position is 0 and ``ll`` is None.
        The position advances by the number of bytes received.

        Returns:
            The response body, or ``b""`` when the requested span is empty.

        Raises:
            URLFileException: on status 416, on a ranged request not answered
                with 206, or on an unranged request not answered with 200.
        """
        download_range = False
        headers: dict[str, str] = {}
        if self._pos != 0 or ll is not None:
            if ll is None:
                end = self.get_length() - 1
            else:
                end = min(self._pos + ll, self.get_length()) - 1
            # `end` is the inclusive index of the last wanted byte, so start == end is a
            # valid one-byte request; only start > end means there is nothing to fetch
            if self._pos > end:
                return b""
            headers['Range'] = f"bytes={self._pos}-{end}"
            download_range = True

        if self._debug:
            t1 = time.time()

        response = self._request('GET', self._url, headers=headers)
        ret = response.data

        if self._debug:
            t2 = time.time()
            if t2 - t1 > 0.1:
                print(f"get {self._url} {headers!r} {t2 - t1:.3f} slow")

        response_code = response.status
        if response_code == 416:  # Requested Range Not Satisfiable
            raise URLFileException(f"Error, range out of bounds {response_code} {headers} ({self._url}): {repr(ret)[:500]}")
        if download_range and response_code != 206:  # Partial Content
            raise URLFileException(f"Error, requested range but got unexpected response {response_code} {headers} ({self._url}): {repr(ret)[:500]}")
        if (not download_range) and response_code != 200:  # OK
            raise URLFileException(f"Error {response_code} {headers} ({self._url}): {repr(ret)[:500]}")

        self._pos += len(ret)
        return ret

    def seek(self, pos:int) -> None:
        """Set the current read position to ``pos`` (no validation is performed)."""
        self._pos = pos

    @property
    def name(self) -> str:
        """The URL this reader was created with."""
        return self._url
# Clear the shared pool manager in forked children so each child builds its own on first use.
# os.register_at_fork is not available on every platform, so only register where it exists.
if hasattr(os, "register_at_fork"):
    os.register_at_fork(after_in_child=URLFile.reset)
|