diff --git a/pyscf/dft/rks.py b/pyscf/dft/rks.py
index 17e8006d4..29f0acaf9 100644
--- a/pyscf/dft/rks.py
+++ b/pyscf/dft/rks.py
@@ -47,6 +47,11 @@ def _ao_cache_key(mol, grids, nao, deriv, blksize):
             grids.weights.ctypes.data, None if non0tab_key is None else id(non0tab_key),
             grids.coords.shape, grids.weights.shape, nao, deriv, blksize)
 
+def _numint_dm_for_cached_ao(ks, dm):
+    if getattr(dm, 'mo_coeff', None) is not None and getattr(ks, '_numint_ao_cache', None):
+        return numpy.asarray(dm)
+    return dm
+
 def _cached_numint_call(ks, ni, mol, grids, max_memory, fn):
     coords = getattr(grids, 'coords', None)
     weights = getattr(grids, 'weights', None)
@@ -58,6 +63,32 @@ def _cached_numint_call(ks, ni, mol, grids, max_memory, fn):
         ao_cache = ks._numint_ao_cache = {}
 
     block_loop = ni.block_loop
+    get_overlap_cond = mol.get_overlap_cond
+    ao_loc_nr = mol.ao_loc_nr
+    overlap_cond = [None]
+    ao_locs = {}
+
+    def cached_get_overlap_cond(shls_slice=None):
+        if shls_slice is None:
+            if overlap_cond[0] is None:
+                overlap_cond[0] = get_overlap_cond()
+            return overlap_cond[0]
+        return get_overlap_cond(shls_slice)
+
+    def cached_ao_loc_nr(cart=None):
+        if cart not in ao_locs:
+            ao_locs[cart] = ao_loc_nr(cart)
+        return ao_locs[cart]
+
+    sentinel = object()
+    mol_dict = getattr(mol, '__dict__', None)
+    old_get_overlap_cond = old_ao_loc_nr = sentinel
+    patched_mol = mol_dict is not None
+    if patched_mol:
+        old_get_overlap_cond = mol_dict.get('get_overlap_cond', sentinel)
+        old_ao_loc_nr = mol_dict.get('ao_loc_nr', sentinel)
+        mol.get_overlap_cond = cached_get_overlap_cond
+        mol.ao_loc_nr = cached_ao_loc_nr
 
     def cached_block_loop(mol1, grids1, nao=None, deriv=0, max_memory=2000,
                           non0tab=None, blksize=None, buf=None):
@@ -97,6 +128,15 @@ def _cached_numint_call(ks, ni, mol, grids, max_memory, fn):
         return fn()
     finally:
         ni.block_loop = block_loop
+        if patched_mol:
+            if old_get_overlap_cond is sentinel:
+                del mol.get_overlap_cond
+            else:
+                mol.get_overlap_cond = old_get_overlap_cond
+            if old_ao_loc_nr is sentinel:
+                del mol.ao_loc_nr
+            else:
+                mol.ao_loc_nr = old_ao_loc_nr
 
 def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
     '''Coulomb + XC functional
@@ -141,9 +181,10 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
         n, exc, vxc = 0, 0, 0
     else:
         max_memory = ks.max_memory - lib.current_memory()[0]
+        dm_numint = _numint_dm_for_cached_ao(ks, dm)
         n, exc, vxc = _cached_numint_call(
             ks, ni, mol, ks.grids, max_memory,
-            lambda: ni.nr_rks(mol, ks.grids, ks.xc, dm, max_memory=max_memory))
+            lambda: ni.nr_rks(mol, ks.grids, ks.xc, dm_numint, max_memory=max_memory))
         logger.debug(ks, 'nelec by numeric integration = %s', n)
         if ks.do_nlc():
             if ni.libxc.is_nlc(ks.xc):
@@ -151,7 +192,7 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
             else:
                 assert ni.libxc.is_nlc(ks.nlc)
                 xc = ks.nlc
-            n, enlc, vnlc = ni.nr_nlc_vxc(mol, ks.nlcgrids, xc, dm,
+            n, enlc, vnlc = ni.nr_nlc_vxc(mol, ks.nlcgrids, xc, dm_numint,
                                           max_memory=max_memory)
             exc += enlc
             vxc += vnlc
diff --git a/pyscf/dft/uks.py b/pyscf/dft/uks.py
index ad4b4ed4d..d04aeb883 100644
--- a/pyscf/dft/uks.py
+++ b/pyscf/dft/uks.py
@@ -25,8 +25,132 @@ import numpy
 from pyscf import lib
 from pyscf.lib import logger
 from pyscf.scf import hf, uhf
+from pyscf.dft import numint
 from pyscf.dft import rks
 
+def _stock_prefers_rho2(ni, mol, dm, mo_occ):
+    ovlp_cond = mol.get_overlap_cond()
+    dm_cond = mol.condense_to_shell(dm, 'absmax')
+    pair_mask = numpy.exp(-ovlp_cond) * dm_cond > ni.cutoff
+    mo_ao_sparsity = max(0.5 * numpy.sum(mo_occ) / dm.shape[-1], 1e-8)
+    ao_loc = mol.ao_loc_nr()
+    wts = (ao_loc[1:] - ao_loc[:-1]) / ao_loc[-1]
+    return numpy.dot(wts, pair_mask).dot(wts) / mo_ao_sparsity >= 4
+
+def _uks_gga_fused_mo(ni, mol, grids, xc_code, dm, hermi):
+    if hermi != 1 or ni._xc_type(xc_code) != 'GGA':
+        return None
+    if not (isinstance(dm, numpy.ndarray) and dm.ndim == 3 and dm.shape[0] == 2):
+        return None
+    if dm.dtype != numpy.double:
+        return None
+    mo_coeff = getattr(dm, 'mo_coeff', None)
+    mo_occ = getattr(dm, 'mo_occ', None)
+    if mo_coeff is None or mo_occ is None:
+        return None
+    if len(mo_coeff) != 2 or len(mo_occ) != 2:
+        return None
+    if mo_coeff[0].ndim != 2 or mo_coeff[1].ndim != 2:
+        return None
+    if mo_coeff[0].shape[0] != dm.shape[-1] or mo_coeff[1].shape[0] != dm.shape[-1]:
+        return None
+    if numpy.any(mo_occ[0] < -numint.OCCDROP) or numpy.any(mo_occ[1] < -numint.OCCDROP):
+        return None
+    if not (_stock_prefers_rho2(ni, mol, dm[0], mo_occ[0]) and
+            _stock_prefers_rho2(ni, mol, dm[1], mo_occ[1])):
+        return None
+    return mo_coeff, mo_occ
+
+def _occupied_coeff(mo_coeff, mo_occ):
+    pos = mo_occ > numint.OCCDROP
+    if numpy.any(pos):
+        return numpy.einsum('ij,j->ij', mo_coeff[:,pos], numpy.sqrt(mo_occ[pos]))
+    return None
+
+def _contract_spin_rho(c0, c1, ngrids, scale=1):
+    if c0.shape[1] == 0:
+        return numpy.zeros(ngrids)
+    rho = numint._contract_rho(c0, c1)
+    if scale != 1:
+        rho *= scale
+    return rho
+
+def _eval_rho2_gga_pair(mol, ao, mo_coeff, mo_occ, non0tab=None):
+    ngrids = ao.shape[-2]
+    cposa = _occupied_coeff(mo_coeff[0], mo_occ[0])
+    cposb = _occupied_coeff(mo_coeff[1], mo_occ[1])
+    if cposa is None and cposb is None:
+        return numpy.zeros((4, ngrids)), numpy.zeros((4, ngrids))
+
+    if cposa is None:
+        nmoa = 0
+        cpos = cposb
+    elif cposb is None:
+        nmoa = cposa.shape[1]
+        cpos = cposa
+    else:
+        nmoa = cposa.shape[1]
+        cpos = numpy.hstack((cposa, cposb))
+
+    shls_slice = (0, mol.nbas)
+    ao_loc = mol.ao_loc_nr()
+    rhoa = numpy.empty((4, ngrids))
+    rhob = numpy.empty((4, ngrids))
+
+    c0 = numint._dot_ao_dm(mol, ao[0], cpos, non0tab, shls_slice, ao_loc)
+    c0a = c0[:, :nmoa]
+    c0b = c0[:, nmoa:]
+    rhoa[0] = _contract_spin_rho(c0a, c0a, ngrids)
+    rhob[0] = _contract_spin_rho(c0b, c0b, ngrids)
+    for i in range(1, 4):
+        c1 = numint._dot_ao_dm(mol, ao[i], cpos, non0tab, shls_slice, ao_loc)
+        rhoa[i] = _contract_spin_rho(c0a, c1[:, :nmoa], ngrids, 2)
+        rhob[i] = _contract_spin_rho(c0b, c1[:, nmoa:], ngrids, 2)
+    return rhoa, rhob
+
+def _nr_uks_gga_fused(ni, mol, grids, xc_code, dm, hermi=1, max_memory=2000):
+    fused_mo = _uks_gga_fused_mo(ni, mol, grids, xc_code, dm, hermi)
+    if fused_mo is None:
+        return None
+    mo_coeff, mo_occ = fused_mo
+
+    nao = dm.shape[-1]
+    ao_loc = mol.ao_loc_nr()
+    cutoff = grids.cutoff * 1e2
+    nbins = numint.NBINS * 2 - int(numint.NBINS * numpy.log(cutoff) /
+                                   numpy.log(grids.cutoff))
+    pair_mask = mol.get_overlap_cond() < -numpy.log(ni.cutoff)
+
+    nelec = numpy.zeros(2)
+    excsum = 0
+    vmat = numpy.zeros((2, nao, nao))
+    aow = None
+    for ao, mask, weight, coords in ni.block_loop(mol, grids, nao, 1,
+                                                  max_memory=max_memory):
+        rho_a, rho_b = _eval_rho2_gga_pair(mol, ao, mo_coeff, mo_occ, mask)
+        exc, vxc = ni.eval_xc_eff(xc_code, (rho_a, rho_b), deriv=1,
+                                  xctype='GGA', spin=1)[:2]
+        den_a = rho_a[0] * weight
+        den_b = rho_b[0] * weight
+        nelec[0] += den_a.sum()
+        nelec[1] += den_b.sum()
+        excsum += numpy.dot(den_a, exc)
+        excsum += numpy.dot(den_b, exc)
+
+        wv = weight * vxc
+        wv[:,0] *= .5
+        wva, wvb = wv
+        aow = numint._scale_ao_sparse(ao, wva, mask, ao_loc, out=aow)
+        numint._dot_ao_ao_sparse(ao[0], aow, None, nbins, mask, pair_mask,
+                                 ao_loc, hermi=0, out=vmat[0])
+        aow = numint._scale_ao_sparse(ao, wvb, mask, ao_loc, out=aow)
+        numint._dot_ao_ao_sparse(ao[0], aow, None, nbins, mask, pair_mask,
+                                 ao_loc, hermi=0, out=vmat[1])
+    vmat = lib.hermi_sum(vmat, axes=(0,2,1))
+    if vmat.dtype != dm.dtype:
+        vmat = numpy.asarray(vmat, dtype=dm.dtype)
+    return nelec, excsum, vmat
+
 def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
     '''Coulomb + XC functional for UKS.  See pyscf/dft/rks.py
     :func:`get_veff` fore more details.
@@ -49,9 +173,17 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
         n, exc, vxc = (0,0), 0, 0
     else:
         max_memory = ks.max_memory - lib.current_memory()[0]
+        dm_numint = rks._numint_dm_for_cached_ao(ks, dm)
+        def nr_uks_call():
+            out = _nr_uks_gga_fused(ni, mol, ks.grids, ks.xc, dm, hermi,
+                                    max_memory=max_memory)
+            if out is not None:
+                return out
+            return ni.nr_uks(mol, ks.grids, ks.xc, dm_numint,
+                             max_memory=max_memory)
         n, exc, vxc = rks._cached_numint_call(
             ks, ni, mol, ks.grids, max_memory,
-            lambda: ni.nr_uks(mol, ks.grids, ks.xc, dm, max_memory=max_memory))
+            nr_uks_call)
         logger.debug(ks, 'nelec by numeric integration = %s', n)
         if ks.do_nlc():
             if ni.libxc.is_nlc(ks.xc):
@@ -59,7 +191,8 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
             else:
                 assert ni.libxc.is_nlc(ks.nlc)
                 xc = ks.nlc
-            n, enlc, vnlc = ni.nr_nlc_vxc(mol, ks.nlcgrids, xc, dm[0]+dm[1],
+            n, enlc, vnlc = ni.nr_nlc_vxc(mol, ks.nlcgrids, xc,
+                                          dm_numint[0]+dm_numint[1],
                                           max_memory=max_memory)
             exc += enlc
             vxc += vnlc