U
    d'                     @   s^  d dl Z d dlZd dlmZmZ d dlmZmZmZm	Z	m
Z
mZmZ d dlm  m  m  mZ d dlmZmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lm Z  d d
l!m"Z" d dl#m$Z$ e$ Z%eG dd dZ&G dd dZ'eee(df e	e e(dddZ)e ee
e( e
e* f dddZ+e&eee(df e	e ee*ef dddZ,dS )    N)	dataclassfield)AnyCallableDictListOptionalTupleUnion)eventsmetrics)
WorkerSpec)LocalElasticAgent)SignalExceptionStd)ChildFailedError)RendezvousParameters)parse_rendezvous_endpoint)
get_loggerc                   @   s  e Zd ZU dZeed< eed< eed< dZeed< dZeed< dZ	eed	< d
Z
eed< eedZeeef ed< dZeed< dZeed< dZeed< dZeed< dZee ed< ejZeeeeef f ed< ejZeeeeef f ed< eedZeeef ed< dd ZdS )LaunchConfiga	  
    Creates a rendezvous config.

    Args:
        min_nodes: Minimum amount of nodes that the user function will
                        be launched on. Elastic agent ensures that the user
                        function start only when the min_nodes amount enters
                        the rendezvous.
        max_nodes: Maximum amount of nodes that the user function
                        will be launched on.
        nproc_per_node: On each node the elastic agent will launch
                            this amount of workers that will execute user
                            defined function.
        rdzv_backend: rdzv_backend to use in the rendezvous (zeus-adapter, etcd).
        rdzv_endpoint: The endpoint of the rdzv sync. storage.
        rdzv_configs: Key, value pair that specifies rendezvous specific configuration.
        rdzv_timeout: Legacy argument that specifies timeout for the rendezvous. It is going
            to be removed in future versions, see the note below. The default timeout is 900 seconds.
        run_id: The unique run id of the job (if not passed a unique one will be
                deduced from run environment - flow workflow id in flow - or auto generated).
        role: User defined role of the worker (defaults to "trainer").
        max_restarts: The maximum amount of restarts that elastic agent will conduct
                    on workers before failure.
        monitor_interval: The interval in seconds that is used by the elastic_agent
                        as a period of monitoring workers.
        start_method: The method is used by the elastic agent to start the
                    workers (spawn, fork, forkserver).
        log_dir: base log directory where log files are written. If not set,
                one is created in a tmp dir but NOT removed on exit.
        redirects: configuration to redirect stdout/stderr to log files.
                Pass a single ``Std`` enum to redirect all workers,
                or a mapping keyed by local_rank to selectively redirect.
        tee: configuration to "tee" stdout/stderr to console + log file.
        metrics_cfg: configuration to initialize metrics.

    ..note:
        `rdzv_timeout` is a legacy argument that will be removed in future.
        Set the timeout via `rdzv_configs['timeout']`

    	min_nodes	max_nodesnproc_per_node run_idZdefault_rolerolerdzv_endpointZetcdrdzv_backend)default_factoryrdzv_configsrdzv_timeout   max_restarts   monitor_intervalspawnstart_methodNlog_dir	redirectsteemetrics_cfgc                 C   s4   d}| j dkr| j | jd< nd| jkr0|| jd< d S )Ni  r    timeout)r!   r   )selfZdefault_timeout r.   B/tmp/pip-unpacked-wheel-ua33x9lu/torch/distributed/launcher/api.py__post_init__W   s
    

zLaunchConfig.__post_init__)__name__
__module____qualname____doc__int__annotations__r   strr   r   r   r   dictr   r   r   r!   r#   r%   floatr'   r(   r   r   NONEr)   r
   r*   r+   r0   r.   r.   r.   r/   r      s$   
)r   c                   @   s2   e Zd ZdZeeeedf dddZdd Z	dS )elastic_launcha  
    Launches an torchelastic agent on the container that invoked the entrypoint.

        1. Pass the ``entrypoint`` arguments as non ``kwargs`` (e.g. no named parameters)/
           ``entrypoint`` can be a function or a command.
        2. The return value is a map of each worker's output mapped
           by their respective global rank.

    Usage

    ::

    def worker_fn(foo):
        # ...

    def main():
        # entrypoint is a function.
        outputs = elastic_launch(LaunchConfig, worker_fn)(foo)
        # return rank 0's output
        return outputs[0]

        # entrypoint is a command and ``script.py`` is the python module.
        ouptuts = elestic_launch(LaunchConfig, "script.py")(args)
        ouptuts = elestic_launch(LaunchConfig, "python")("script.py")
    N)config
entrypointc                 C   s   || _ || _d S N)_config_entrypoint)r-   r<   r=   r.   r.   r/   __init__z   s    zelastic_launch.__init__c                 G   s   t | j| jt|S r>   )launch_agentr?   r@   list)r-   argsr.   r.   r/   __call__   s    zelastic_launch.__call__)
r1   r2   r3   r4   r   r
   r   r7   rA   rE   r.   r.   r.   r/   r;   _   s
   r;   )r=   rD   returnc                 C   sF   t | tr| jS t | tr>| tjkr8tdd |D dS | S ndS dS )a  Retrive entrypoint name with the rule:
    1. If entrypoint is a function, use ``entrypont.__qualname__``.
    2. If entrypoint is a string, check its value:
        2.1 if entrypoint equals to ``sys.executable`` (like "python"), use the first element from ``args``
            which does not start with hifen letter (for example, "-u" will be skipped).
        2.2 otherwise, use ``entrypoint`` value.
    3. Otherwise, return empty string.
    c                 s   s   | ]}|d  dkr|V  qdS )r   -Nr.   ).0argr.   r.   r/   	<genexpr>   s      z'_get_entrypoint_name.<locals>.<genexpr>r   N)
isinstancer   r1   r7   sys
executablenext)r=   rD   r.   r.   r/   _get_entrypoint_name   s    


rO   )rdzv_parametersrF   c                 C   sX   | j dkrdS | j}| }|s(tdt|dd\}}|dkrPtd| d||fS )NZstatic)NNzKEndpoint is missing in endpoint. Try to add --master_addr and --master_portr    )default_portzport is missing in endpoint: z. Try to specify --master_port)backendendpointstrip
ValueErrorr   )rP   rS   master_addrmaster_portr.   r.   r/   _get_addr_and_port   s    

rX   )r<   r=   rD   rF   c                 C   s  | j s*tt j}td|  || _ t||}td| d| j	 d| j
 d| j d| j  d| j d| j d	| j d
| j d| j d| j d| j d tf | j| j| j | j	| j
d| j}t|\}}t| j| j|t|t|| j| j| j| j||d}t|| j| jd}	d}
zzLt !t "| j |	# }t$%|	&  |' r^t(||j)d|j*W W jS  t(k
r    Y nN t+k
r   d}
t$%|	,   Y n& t-k
r   t$%|	,   Y nX W 5 |
r|j  X d S )Nz1config has no run_id, generated a random run_id: zDStarting elastic_operator with launch configs:
  entrypoint       : z
  min_nodes        : z
  max_nodes        : z
  nproc_per_node   : z
  run_id           : z
  rdzv_backend     : z
  rdzv_endpoint    : z
  rdzv_configs     : z
  max_restarts     : z
  monitor_interval : z
  log_dir          : z
  metrics_cfg      : 
)rR   rS   r   r   r   )r   Zlocal_world_sizer=   rD   rdzv_handlerr#   r%   r)   r*   rV   rW   )specr'   r(   T)namefailuresF).r   r7   uuidZuuid4r5   loggerwarningrO   infor   r   r   r   r   r   r#   r%   r(   r+   r   rX   r   r   tuplerdzv_registryZget_rendezvous_handlerr)   r*   r   r'   rZ   shutdownr   Zinitialize_metricsZMetricsConfigrunr   recordZget_event_succeededZ	is_failedr   r]   Zreturn_valuesr   Zget_event_failed	Exception)r<   r=   rD   r   Zentrypoint_namerP   rV   rW   r[   ZagentZshutdown_rdzvresultr.   r.   r/   rB      st    
b	  
rB   )-rL   r^   Zdataclassesr   r   typingr   r   r   r   r   r	   r
   Z-torch.distributed.elastic.rendezvous.registryZdistributedZelasticZ
rendezvousregistryrc   Ztorch.distributed.elasticr   r   Z*torch.distributed.elastic.agent.server.apir   Z:torch.distributed.elastic.agent.server.local_elastic_agentr   Z)torch.distributed.elastic.multiprocessingr   r   Z0torch.distributed.elastic.multiprocessing.errorsr   Z$torch.distributed.elastic.rendezvousr   Z*torch.distributed.elastic.rendezvous.utilsr   Z'torch.distributed.elastic.utils.loggingr   r_   r   r;   r7   rO   r5   rX   rB   r.   r.   r.   r/   <module>   s8   $C( 
