diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 501ccd89b..17c056584 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -10,7 +10,7 @@ jobs:
 - job: 'EvalChanges'
   displayName: 'Analyze changed files to determine which job to run'
   pool:
-    vmImage: 'macOS-10.13'
+    vmImage: 'macOS-10.15'
   steps:
   # We want to enforce the following rules for PRs:
   # * if all modifications are to README.md
@@ -144,7 +144,7 @@ jobs:
   variables:
     python.version: '3.6'
   pool:
-    vmImage: 'macOS-10.13'
+    vmImage: 'macOS-10.15'
   steps:
   - template: azure-pipelines-steps.yml
     parameters:
@@ -162,7 +162,7 @@ jobs:
       imageName: 'ubuntu-16.04'
       python.version: '3.5'
     macOS, Python 3.5:
-      imageName: 'macOS-10.13'
+      imageName: 'macOS-10.15'
       python.version: '3.5'
     Windows, Python 3.5:
       imageName: 'vs2017-win2016'
@@ -171,7 +171,7 @@ jobs:
      imageName: 'ubuntu-16.04'
      python.version: '3.6'
    macOS, Python 3.6:
-      imageName: 'macOS-10.13'
+      imageName: 'macOS-10.15'
      python.version: '3.6'
    Windows, Python 3.6:
      imageName: 'vs2017-win2016'
@@ -180,7 +180,7 @@ jobs:
      imageName: 'ubuntu-16.04'
      python.version: '3.7'
    macOS, Python 3.7:
-      imageName: 'macOS-10.13'
+      imageName: 'macOS-10.15'
      python.version: '3.7'
    Windows, Python 3.7:
      imageName: 'vs2017-win2016'
diff --git a/doc/spec/comparison.rst b/doc/spec/comparison.rst
index 5b3d98c82..069121b20 100644
--- a/doc/spec/comparison.rst
+++ b/doc/spec/comparison.rst
@@ -36,60 +36,59 @@ Detailed estimator comparison
 | :class:`.NonParamDMLCateEstimator`          | 1-d/Binary   |              |                  | Yes         |                 | Yes        |              | Yes                |
 +---------------------------------------------+--------------+--------------+------------------+-------------+-----------------+------------+--------------+--------------------+

-.. glossary::
-    Treatment Type
-        Some estimators can only estimate effects of particular kinds of treatments.
-        *Discrete* treatments can be described by a finite number of comprehensive categories (for example,
-        group A received a 10% discount on product 1, group B received a 10% discount on product 2, group C
-        received no discounts). *Binary* treatments are a special case of discrete treatments with only two
-        categories. *Continuous* treatments can take on any value along the number line (for example, minutes of
-        exercise per week).
+Treatment Type
+    Some estimators can only estimate effects of particular kinds of treatments.
+    *Discrete* treatments can be described by a finite number of comprehensive categories (for example,
+    group A received a 10% discount on product 1, group B received a 10% discount on product 2, group C
+    received no discounts). *Binary* treatments are a special case of discrete treatments with only two
+    categories. *Continuous* treatments can take on any value along the number line (for example, minutes of
+    exercise per week).

-    Requires Instrument
-        Some estimators identify the causal effect of a treatment by considering only a subset of the variation in
-        treatment intensity that is conditionally random given other data features. This subset of the variation
-        is driven by an instrument, which is usually some kind of randomization (i.e. an earlier experiment or a
-        lottery). See the Instrumental Variable Regression section for more information on picking a good
-        instrument.
+Requires Instrument
+    Some estimators identify the causal effect of a treatment by considering only a subset of the variation in
+    treatment intensity that is conditionally random given other data features. This subset of the variation
+    is driven by an instrument, which is usually some kind of randomization (i.e. an earlier experiment or a
+    lottery). See the Instrumental Variable Regression section for more information on picking a good
+    instrument.

-    Delivers Confidence Intervals
-        Many estimators can deliver analytic confidence intervals for the final treatment effects. These
-        confidence intervals correctly adjust for the reuse of data across multiple stages of estimation. EconML
-        cannot deliver analytic confidence intervals in cases where this multi-stage estimation is too complex or
-        for estimators such as the MetaLearners that trade honest confidence intervals for model selection and
-        regularization. In these cases it is still possible to get bootstrap confidence intervals, but this
-        process is slow and may not be statistically valid.
+Delivers Confidence Intervals
+    Many estimators can deliver analytic confidence intervals for the final treatment effects. These
+    confidence intervals correctly adjust for the reuse of data across multiple stages of estimation. EconML
+    cannot deliver analytic confidence intervals in cases where this multi-stage estimation is too complex or
+    for estimators such as the MetaLearners that trade honest confidence intervals for model selection and
+    regularization. In these cases it is still possible to get bootstrap confidence intervals, but this
+    process is slow and may not be statistically valid.

-    Linear Treatment
-        Some estimators impose the assumption that the outcome is a linear function of the treatment. These
-        estimators can also estimate a non-linear relationship between a treatment and the outcome if the
-        structure of the relationship is known and additively separable (for example, the linear function could
-        include both treatment and treatment-squared for continuous treatments). These linear functions can also
-        include specified interactions between treatments. However, these estimators cannot estimate a fully
-        flexible non-parametric relationship between treatments and the outcome (for example, the relationship
-        cannot be modeled by a forest).
+Linear Treatment
+    Some estimators impose the assumption that the outcome is a linear function of the treatment. These
+    estimators can also estimate a non-linear relationship between a treatment and the outcome if the
+    structure of the relationship is known and additively separable (for example, the linear function could
+    include both treatment and treatment-squared for continuous treatments). These linear functions can also
+    include specified interactions between treatments. However, these estimators cannot estimate a fully
+    flexible non-parametric relationship between treatments and the outcome (for example, the relationship
+    cannot be modeled by a forest).

-    Linear Heterogeneity
-        The CATE function determines how the size of a user’s response to the treatment varies by user features.
-        Some estimators impose the *assumption* that effect size is a linear function of user features. A few models
-        estimate a more flexible relationship between effect size and user features and then *project* that flexible
-        function onto a linear model. This second approach delivers a better-fitting linear approximation of a
-        non-linear relationship, but is less efficient in cases where you are confident assuming the true
-        relationship is linear. Finally, some estimation models allow a fully flexible relationship between
-        effect size and user features with no linearity structure.
+Linear Heterogeneity
+    The CATE function determines how the size of a user’s response to the treatment varies by user features.
+    Some estimators impose the *assumption* that effect size is a linear function of user features. A few models
+    estimate a more flexible relationship between effect size and user features and then *project* that flexible
+    function onto a linear model. This second approach delivers a better-fitting linear approximation of a
+    non-linear relationship, but is less efficient in cases where you are confident assuming the true
+    relationship is linear. Finally, some estimation models allow a fully flexible relationship between
+    effect size and user features with no linearity structure.

-    Multiple Outcomes
-        Some estimation models allow joint estimation of the effects of treatment(s) on multiple outcomes. Other
-        models only accommodate a single outcome.
+Multiple Outcomes
+    Some estimation models allow joint estimation of the effects of treatment(s) on multiple outcomes. Other
+    models only accommodate a single outcome.

-    Multiple Treatments
-        Some estimation models allow joint estimation of the effects of multiple treatments on outcome(s). Other
-        models only accommodate a single treatment.
+Multiple Treatments
+    Some estimation models allow joint estimation of the effects of multiple treatments on outcome(s). Other
+    models only accommodate a single treatment.

-    High-Dimensional Features
-        Many estimators only behave well with a small set of specified features, X, that affect the size of a
-        user’s response to the treatment. If you do not already know which few features might reasonably affect
-        the user’s response, use one of our sparse estimators that can handle large feature sets and penalize them
-        to discover the features that are most correlated with treatment effect heterogeneity.
+High-Dimensional Features
+    Many estimators only behave well with a small set of specified features, X, that affect the size of a
+    user’s response to the treatment. If you do not already know which few features might reasonably affect
+    the user’s response, use one of our sparse estimators that can handle large feature sets and penalize them
+    to discover the features that are most correlated with treatment effect heterogeneity.
diff --git a/econml/deepiv.py b/econml/deepiv.py
index 1cc0834ce..8dbbb4bb7 100644
--- a/econml/deepiv.py
+++ b/econml/deepiv.py
@@ -442,3 +442,407 @@ def predict(self, T, X):
             singleton dimension will be collapsed (so this method will return a vector)
         """
         return self._effect_model.predict([T, X]).reshape((-1,) + self._d_y)
+
+
+try:
+    import torch
+    import torch.nn as nn
+    import torch.nn.functional as F
+    from torch.utils.data import DataLoader, TensorDataset
+
+    def _expand(v, n, dim):  # repeat v `n` times along a new dimension dim
+        assert -v.dim() - 1 <= dim <= v.dim() + 1
+        sizes = [n if (i == dim or i == dim + v.dim() + 1) else -1 for i in range(v.dim() + 1)]
+        return v.unsqueeze(dim).expand(*sizes)
+
+    class TorchMogModel(torch.nn.Module):
+        """
+        A mixture of Gaussians model with the specified number of components.
+
+        Parameters
+        ----------
+        n_components : int
+            The number of components in the mixture model
+
+        d_i : int
+            The number of dimensions in the layer used as input
+
+        d_t : int, optional
+            The number of dimensions in the output (or None for a vector output)
+
+        """
+
+        def __init__(self, n_components, d_i, d_t):
+            super().__init__()
+            self._n_components = n_components
+            self._d_t = d_t
+            self.pi = nn.Linear(d_i, n_components)
+            self.mu = nn.Linear(d_i, n_components * (d_t if d_t else 1))
+            self.sig = nn.Linear(d_i, n_components)
+
+        def forward(self, x):
+            return (F.softmax(self.pi(x), dim=-1),
+                    self.mu(x).reshape(*(x.size()[:-1] + (self._n_components,) + ((self._d_t,) if self._d_t else ()))),
+                    torch.exp(self.sig(x)))
+
+    class TorchMogLoss(nn.Module):
+        """Create a module that computes the loss of a mixture of Gaussians model on data."""
+
+        # LL = C - log(sum(pi_i/sig^d * exp(-d2/(2*sig^2))))
+        def forward(self, pi, mu, sig, t):
+            assert pi.size() == sig.size()
+            # with a vector treatment, won't have an extra dim for T
+            assert mu.size() == pi.size() or mu.size()[:-1] == pi.size()
+
+            t_vec = mu.dim() == pi.dim()  # is t a vector instead of an array?
+
+            if t_vec:  # explicitly add singleton treatment dim to simplify remainder of logic
+                assert t.size() == pi.size()[:-1]
+                mu = mu.unsqueeze(-1)
+                t = t.unsqueeze(-1)
+            else:
+                assert t.size() == pi.size()[:-1] + mu.size()[-1:]
+
+            d_t = t.size()[-1]
+            t = t.unsqueeze(dim=-2)  # insert a 1 for the n_components dimension
+            # || t - mu_i || ^2
+            d2 = ((t - mu) * (t - mu)).sum(dim=-1)
+            return -torch.log(torch.sum(pi / torch.pow(sig, d_t) * torch.exp(-d2 / (2 * sig * sig)), dim=-1))
+
+    class TorchMogSampleModel(nn.Module):
+        """Create a module that generates samples from a mixture of Gaussians."""
+
+        @torch.no_grad()
+        def forward(self, n_samples, pi, mu, sig):
+            assert pi.size() == sig.size()
+            # with a vector treatment, won't have an extra dim for T
+            assert mu.size() == pi.size() or mu.size()[:-1] == pi.size()
+
+            t_vec = mu.dim() == pi.dim()  # is t a vector instead of an array?
+
+            batch_size = pi.size()[:-1]
+            n_c = pi.size()[-1]
+
+            if t_vec:
+                mu = mu.unsqueeze(-1)  # explicitly add singleton treatment dimension to simplify everything else
+
+            n_t = mu.size()[-1]  # number of treatments
+
+            # expand pi to (... × n_samples × n_components), then reshape to (prod(...)*n_samples × n_components)
+            # since multinomial acts row-wise
+            pi = _expand(pi, n_samples, -2).reshape(-1, n_c)
+            ind = torch.multinomial(pi, 1).reshape(*(batch_size + (n_samples,)))
+
+            # select sig elements corresponding to selected component
+            sig = torch.gather(sig, -1, ind)
+            # to create covariance matrix, need to make a scaled identity matrix in two extra dims
+            sig = sig.unsqueeze(-1).unsqueeze(-1) * torch.eye(n_t)
+
+            # for mu, need to add an additional d_t dim to ind before selecting
+            ind = _expand(ind, mu.size()[-1], dim=-1)
+            mu = torch.gather(mu, -2, ind)
+
+            samples = torch.distributions.MultivariateNormal(mu, scale_tril=sig).sample()
+
+            if t_vec:
+                return samples.squeeze(-1)  # drop added singleton dim
+            return samples
+
+    class TorchResponseLoss(nn.Module):
+        """
+        Torch module that computes the loss of a response model on data.
+
+        Parameters
+        ----------
+        h : Module (with signature (tensor, tensor) -> tensor)
+            Method for generating samples of y given samples of t and x
+
+        sample_t : int -> Tensor
+            Method for getting n samples of t
+
+        x : Tensor
+            Values of x
+
+        y : Tensor
+            Values of y
+
+        samples: int
+            The number of samples to use
+
+        use_upper_bound : bool
+            Whether to use an upper bound to the true loss
+            (equivalent to adding a regularization penalty on the variance of h)
+
+        gradient_samples : int
+            The number of separate additional samples to use when calculating the gradient.
+            This can only be nonzero if use_upper_bound is False, in which case the gradient of
+            the returned loss will be an unbiased estimate of the gradient of the true loss.
+
+        """
+
+        def forward(self, h, sample_t, x, y, samples=50, use_upper_bound=False, gradient_samples=50):
+            assert not (use_upper_bound and gradient_samples)
+
+            # Note that we assume that there is a single batch dimension, so that we expand x and y along dim=1
+            # This is because if x or y is a vector, then expanding along dim=-2 would do the wrong thing
+
+            # generate n samples of t, then take the mean of f(t,x) with that sample and an expanded x
+            def mean(f, n):
+                result = torch.mean(f(sample_t(n), _expand(x, n, dim=1)), dim=1)
+                assert y.size() == result.size()
+                return result
+
+            if gradient_samples:
+                # we want to separately sample the gradient; we use detach to treat the sampled model as constant
+                # the overall computation ensures that we have an interpretable loss (y-h̅(p,x))²,
+                # but also that the gradient is -2(y-h̅(p,x))∇h̅(p,x) with *different* samples used for each average
+                diff = y - mean(h, samples)
+                grad = 2 * mean(h, gradient_samples)
+                return diff.detach() * ((diff + grad).detach() - grad)
+            elif use_upper_bound:
+                # mean of (y-h(p,x))²
+                return mean(lambda t, x: (_expand(y, samples, dim=1) - h(t, x)).pow(2), samples)
+            else:
+                return (y - mean(h, samples)).pow(2)
+
+    class TorchDeepIVEstimator(BaseCateEstimator):
+        """
+        The Deep IV Estimator (see http://proceedings.mlr.press/v70/hartford17a/hartford17a.pdf).
+
+        Parameters
+        ----------
+        n_components : int
+            Number of components in the mixture density network
+
+        m : Module (signature (tensor, tensor) -> tensor)
+            Torch module featurizing z and x inputs
+
+        h : Module (signature (tensor, tensor) -> tensor)
+            Torch module returning y given t and x. This should work on tensors with arbitrary leading dimensions.
+
+        n_samples : int
+            The number of samples to use
+
+        use_upper_bound_loss : bool, optional
+            Whether to use an upper bound to the true loss
+            (equivalent to adding a regularization penalty on the variance of h).
+            Defaults to False.
+
+        n_gradient_samples : int, optional
+            The number of separate additional samples to use when calculating the gradient.
+            This can only be nonzero if use_upper_bound_loss is False, in which case the gradient of
+            the returned loss will be an unbiased estimate of the gradient of the true loss.
+            Defaults to 0.
+
+        first_stage_batch_size : int, optional
+            Batch size to use when training the first-stage (treatment) model. Defaults to 32.
+
+        second_stage_batch_size : int, optional
+            Batch size to use when training the second-stage (response) model. Defaults to 32.
+
+        first_stage_epochs : int, optional
+            Number of epochs to use when training the first-stage model. Defaults to 2.
+
+        second_stage_epochs : int, optional
+            Number of epochs to use when training the second-stage model. Defaults to 2.
+
+        optimizer : parameters -> Optimizer
+            The optimizer to use. Defaults to `Adam`
+
+        inference: string, inference method, or None
+            Method for performing inference. This estimator supports 'bootstrap'
+            (or an instance of `BootstrapOptions`)
+
+        """
+
+        def __init__(self, n_components, m, h,
+                     n_samples, use_upper_bound_loss=False, n_gradient_samples=0,
+                     first_stage_batch_size=32,
+                     second_stage_batch_size=32,
+                     first_stage_epochs=2,
+                     second_stage_epochs=2,
+                     optimizer=torch.optim.Adam,
+                     inference=None):
+            self._n_components = n_components
+            self._m = m
+            self._h = h
+            self._n_samples = n_samples
+            self._use_upper_bound_loss = use_upper_bound_loss
+            self._n_gradient_samples = n_gradient_samples
+            self._first_stage_batch_size = first_stage_batch_size
+            self._second_stage_batch_size = second_stage_batch_size
+            self._first_stage_epochs = first_stage_epochs
+            self._second_stage_epochs = second_stage_epochs
+            self._optimizer = optimizer
+            super().__init__(inference=inference)
+
+        def _fit_impl(self, Y, T, X, Z):
+            """Estimate the counterfactual model from data.
+
+            That is, estimate functions τ(·, ·, ·), ∂τ(·, ·).
+
+            Parameters
+            ----------
+            Y: (n × d_y) matrix or vector of length n
+                Outcomes for each sample
+            T: (n × dₜ) matrix or vector of length n
+                Treatments for each sample
+            X: optional (n × dₓ) matrix
+                Features for each sample
+            Z: optional (n × d_z) matrix
+                Instruments for each sample
+
+            Returns
+            -------
+            self
+
+            """
+            assert 1 <= np.ndim(X) <= 2
+            assert 1 <= np.ndim(Z) <= 2
+            assert 1 <= np.ndim(T) <= 2
+            assert 1 <= np.ndim(Y) <= 2
+            assert np.shape(X)[0] == np.shape(Y)[0] == np.shape(T)[0] == np.shape(Z)[0]
+            # in case vectors were passed for Y or T, keep track of trailing dims for reshaping effect output
+
+            d_x, d_y, d_z, d_t = [np.shape(a)[1:] for a in [X, Y, Z, T]]
+            self._d_y = d_y
+
+            d_m = self._m(torch.Tensor(np.empty((1,) + d_z)), torch.Tensor(np.empty((1,) + d_x))).size()[1]
+
+            Y, T, X, Z = [torch.from_numpy(A).float() for A in (Y, T, X, Z)]
+            n_components = self._n_components
+
+            treatment_model = self._m
+
+            class Mog(nn.Module):
+                def __init__(self):
+                    super().__init__()
+                    self.treatment_model = treatment_model
+                    # d_t is a shape tuple here, but TorchMogModel expects an int (or None for a vector treatment)
+                    self.mog_model = TorchMogModel(n_components, d_m, (d_t[0] if d_t else None))
+
+                def forward(self, z, x):
+                    features = self.treatment_model(z, x)
+                    return self.mog_model(features)
+
+            mog = Mog()
+            self._mog = mog
+            mog.train()
+            opt = self._optimizer(mog.parameters())
+
+            # train first-stage model
+            loader = DataLoader(TensorDataset(T, Z, X), shuffle=True, batch_size=self._first_stage_batch_size)
+            for epoch in range(self._first_stage_epochs):
+                total_loss = 0
+                for i, (t, z, x) in enumerate(loader):
+                    opt.zero_grad()
+                    pi, mu, sig = mog(z, x)
+                    loss = TorchMogLoss()(pi, mu, sig, t).sum()
+                    total_loss += loss.item()
+                    if i % 30 == 0:
+                        print(loss / t.size()[0])
+                    loss.backward()
+                    opt.step()
+                print(f"Average loss for epoch {epoch+1}: {total_loss / len(loader.dataset)}")
+
+            mog.eval()  # set mog to evaluation mode
+            for p in mog.parameters():
+                p.requires_grad_(False)
+
+            self._h.train()
+            opt = self._optimizer(self._h.parameters())
+
+            loader = DataLoader(TensorDataset(Y, Z, X), shuffle=True, batch_size=self._second_stage_batch_size)
+            for epoch in range(self._second_stage_epochs):
+                total_loss = 0
+                for i, (y, z, x) in enumerate(loader):
+                    opt.zero_grad()
+                    pi, mu, sig = mog(z, x)
+                    loss = TorchResponseLoss()(self._h,
+                                               lambda n: TorchMogSampleModel()(n, pi, mu, sig),
+                                               x, y,
+                                               self._n_samples, self._use_upper_bound_loss, self._n_gradient_samples)
+                    loss = loss.sum()
+                    total_loss += loss.item()
+                    if i % 30 == 0:
+                        print(loss / y.size()[0])
+                    loss.backward()
+                    opt.step()
+                print(f"Average loss for epoch {epoch+1}: {total_loss / len(loader.dataset)}")
+
+            self._h.eval()  # set h to evaluation mode
+            for p in self._h.parameters():
+                p.requires_grad_(False)
+
+        def effect(self, X=None, T0=0, T1=1):
+            """
+            Calculate the heterogeneous treatment effect τ(·,·,·).
+
+            The effect is calculated between the two treatment points
+            conditional on a vector of features on a set of m test samples {T0ᵢ, T1ᵢ, Xᵢ}.
+
+            Parameters
+            ----------
+            T0: (m × dₜ) matrix
+                Base treatments for each sample
+            T1: (m × dₜ) matrix
+                Target treatments for each sample
+            X: optional (m × dₓ) matrix
+                Features for each sample
+
+            Returns
+            -------
+            τ: (m × d_y) matrix
+                Heterogeneous treatment effects on each outcome for each sample
+                Note that when Y is a vector rather than a 2-dimensional array, the corresponding
+                singleton dimension will be collapsed (so this method will return a vector)
+            """
+            if np.ndim(T0) == 0:
+                T0 = np.repeat(T0, 1 if X is None else np.shape(X)[0])
+            if np.ndim(T1) == 0:
+                T1 = np.repeat(T1, 1 if X is None else np.shape(X)[0])
+            if X is None:
+                X = np.empty((np.shape(T0)[0], 0))
+            return self.predict(T1, X) - self.predict(T0, X)
+
+        def marginal_effect(self, T, X=None):
+            """
+            Calculate the marginal effect ∂τ(·, ·) around a base treatment point conditional on features.
+
+            Parameters
+            ----------
+            T: (m × dₜ) matrix
+                Base treatments for each sample
+            X: optional (m × dₓ) matrix
+                Features for each sample
+
+            Returns
+            -------
+            grad_tau: (m × d_y × dₜ) array
+                Heterogeneous marginal effects on each outcome for each sample
+                Note that when Y or T is a vector rather than a 2-dimensional array,
+                the corresponding singleton dimensions in the output will be collapsed
+                (e.g. if both are vectors, then the output of this method will also be a vector)
+            """
+            if X is None:
+                X = np.empty((np.shape(T)[0], 0))
+            X, T = [torch.from_numpy(A).float() for A in [X, T]]
+            if self._d_y:
+                X, T = [A.unsqueeze(1).expand((-1,) + self._d_y + (-1,)) for A in [X, T]]
+            T.requires_grad_(True)
+            if self._d_y:
+                self._h(T, X).backward(torch.eye(self._d_y[0]).expand(X.size()[0], -1, -1))
+                return T.grad.numpy()
+            else:
+                self._h(T, X).backward(torch.ones(X.size()[0]))
+                return T.grad.numpy()
+
+        def predict(self, T, X):
+            """Predict outcomes given treatment assignments and features.
+
+            Parameters
+            ----------
+            T: (m × dₜ) matrix
+                Base treatments for each sample
+            X: (m × dₓ) matrix
+                Features for each sample
+
+            Returns
+            -------
+            Y: (m × d_y) matrix
+                Outcomes for each sample
+                Note that when Y is a vector rather than a 2-dimensional array, the corresponding
+                singleton dimension will be collapsed (so this method will return a vector)
+            """
+            X, T = [torch.from_numpy(A).float() for A in [X, T]]
+            return self._h(T, X).numpy().reshape((-1,) + self._d_y)
+
+except ModuleNotFoundError as e:
+    pass
diff --git a/setup.cfg b/setup.cfg
index 65652dd6c..903c161f8 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -49,13 +49,16 @@ install_requires =
     graphviz
     matplotlib < 3.1; python_version <= '3.5'
     matplotlib; python_version > '3.5'
+    llvmlite < 0.32; python_version <= '3.5'
     pandas
 test_suite = econml.tests
 tests_require =
     pytest
     pytest-xdist
     pytest-cov
+    torch >= 1.1
     jupyter
+    nbconvert != 6.0.0a1; python_version <= '3.5'
     seaborn
     lightgbm
     dowhy
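Usage note (not part of the diff): a minimal sketch of how the new TorchDeepIVEstimator might be driven end to end. The synthetic data, the TreatmentFeatures/Response module names and sizes, and the assumption that BaseCateEstimator exposes a fit(Y, T, X, Z) entry point that delegates to _fit_impl are illustrative assumptions, not something this change guarantees; the example uses vector-valued T and Y, which is the code path the new estimator exercises most directly.

# Illustrative sketch only; data, module names, and the fit(Y, T, X, Z) entry point are assumptions.
import numpy as np
import torch
import torch.nn as nn
from econml.deepiv import TorchDeepIVEstimator

n = 2000
x = np.random.normal(size=(n, 3)).astype(np.float32)                           # observed features
z = np.random.normal(size=(n, 1)).astype(np.float32)                           # instrument
t = (z[:, 0] + 0.5 * x[:, 0] + np.random.normal(size=n)).astype(np.float32)    # vector treatment
y = (2 * t + x[:, 1] + np.random.normal(size=n)).astype(np.float32)            # vector outcome

class TreatmentFeatures(nn.Module):
    """m(z, x): featurizes the instrument and features for the mixture density network."""
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(4, 16), nn.ReLU(), nn.Linear(16, 16))

    def forward(self, z, x):
        return self.net(torch.cat([z, x], dim=-1))

class Response(nn.Module):
    """h(t, x): predicts the outcome; works with extra leading sample dimensions."""
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(4, 16), nn.ReLU(), nn.Linear(16, 1))

    def forward(self, t, x):
        # t is vector-shaped (possibly with extra leading dims), so add a trailing treatment dim
        return self.net(torch.cat([t.unsqueeze(-1), x], dim=-1)).squeeze(-1)

est = TorchDeepIVEstimator(n_components=5, m=TreatmentFeatures(), h=Response(), n_samples=50)
est.fit(y, t, x, z)                  # assumed BaseCateEstimator entry point calling _fit_impl
effects = est.effect(x, T0=0, T1=1)  # per-sample effect of moving the treatment from 0 to 1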