From a097977df4d9af3a5e4e1e6e484f8ed2a99ef81d Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Sun, 16 Jul 2017 21:04:16 +0200
Subject: [PATCH 01/37] import numpy as np ; from pandas import Panel

Also commented out the last line of the file, which called the undefined name `add_class_method`.

$ flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics

```
./bench/bench_join_panel.py:37:26: F821 undefined name 'np'
                values = np.concatenate([p.values for p in panels], axis=1)
                         ^

./bench/bench_join_panel.py:42:13: F821 undefined name 'Panel'
        p = Panel(values, items=items, major_axis=major,
            ^

./bench/bench_join_panel.py:79:16: F821 undefined name 'np'
        data = np.dstack([np.asarray([np.asarray([d.get((minor_i, major_i, item), np.nan)
               ^

./bench/bench_join_panel.py:79:27: F821 undefined name 'np'
        data = np.dstack([np.asarray([np.asarray([d.get((minor_i, major_i, item), np.nan)
                          ^

./bench/bench_join_panel.py:79:39: F821 undefined name 'np'
        data = np.dstack([np.asarray([np.asarray([d.get((minor_i, major_i, item), np.nan)
                                      ^

./bench/bench_join_panel.py:79:83: F821 undefined name 'np'
        data = np.dstack([np.asarray([np.asarray([d.get((minor_i, major_i, item), np.nan)
                                                                                  ^

./bench/bench_join_panel.py:84:16: F821 undefined name 'Panel'
        return Panel(data, items, major, minor)
               ^

./bench/bench_join_panel.py:85:1: F821 undefined name 'add_class_method'
add_class_method(Panel, create_panels_join, 'join_many')
^

./bench/bench_join_panel.py:85:18: F821 undefined name 'Panel'
add_class_method(Panel, create_panels_join, 'join_many')
                 ^
```
---
 bench/bench_join_panel.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/bench/bench_join_panel.py b/bench/bench_join_panel.py
index f3c3f8ba15f70..c16ff907efdb4 100644
--- a/bench/bench_join_panel.py
+++ b/bench/bench_join_panel.py
@@ -1,5 +1,8 @@
 # reasonably efficient
 
+import numpy as np
+from pandas import Panel
+
 
 def create_panels_append(cls, panels):
         """ return an append list of panels """
@@ -82,4 +85,4 @@ def create_panels_join(cls, panels):
                           for minor_i in minor])
         # construct the panel
         return Panel(data, items, major, minor)
-add_class_method(Panel, create_panels_join, 'join_many')
+# add_class_method(Panel, create_panels_join, 'join_many')

From fb5ef7169ee217cdbefedada9321baaf74cee77e Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Sun, 16 Jul 2017 21:13:48 +0200
Subject: [PATCH 02/37] from pandas.compat import string_types, text_type

$ flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics

```
./scripts/find_commits_touching_func.py:97:12: F821 undefined name 'compat'
    return compat.text_type(r).split(sep)
           ^

./scripts/find_commits_touching_func.py:186:35: F821 undefined name 'compat'
    if isinstance(args.file_masks,compat.string_types):
                                  ^

./scripts/find_commits_touching_func.py:188:35: F821 undefined name 'compat'
    if isinstance(args.path_masks,compat.string_types):
                                  ^

./scripts/find_commits_touching_func.py:190:34: F821 undefined name 'compat'
    if isinstance(args.dir_masks,compat.string_types):
                                 ^
```
---
 scripts/find_commits_touching_func.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/scripts/find_commits_touching_func.py b/scripts/find_commits_touching_func.py
index 099761f38bb44..74ea120bf0b64 100755
--- a/scripts/find_commits_touching_func.py
+++ b/scripts/find_commits_touching_func.py
@@ -4,7 +4,7 @@
 # copryright 2013, y-p @ github
 
 from __future__ import print_function
-from pandas.compat import range, lrange, map
+from pandas.compat import range, lrange, map, string_types, text_type
 
 """Search the git history for all commits touching a named method
 
@@ -94,7 +94,7 @@ def get_hits(defname,files=()):
 
 def get_commit_info(c,fmt,sep='\t'):
     r=sh.git('log', "--format={}".format(fmt), '{}^..{}'.format(c,c),"-n","1",_tty_out=False)
-    return compat.text_type(r).split(sep)
+    return text_type(r).split(sep)
 
 def get_commit_vitals(c,hlen=HASH_LEN):
     h,s,d= get_commit_info(c,'%H\t%s\t%ci',"\t")
@@ -183,11 +183,11 @@ def main():
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 """)
         return
-    if isinstance(args.file_masks,compat.string_types):
+    if isinstance(args.file_masks, string_types):
         args.file_masks = args.file_masks.split(',')
-    if isinstance(args.path_masks,compat.string_types):
+    if isinstance(args.path_masks, string_types):
         args.path_masks = args.path_masks.split(',')
-    if isinstance(args.dir_masks,compat.string_types):
+    if isinstance(args.dir_masks, string_types):
         args.dir_masks = args.dir_masks.split(',')
 
     logger.setLevel(getattr(logging,args.debug_level))

From e70995aa1a1989a993145fa68dab8d69a533d11e Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Sun, 16 Jul 2017 21:21:30 +0200
Subject: [PATCH 03/37] password=############ is a Python Syntax Error

If the syntax error is intentional, an alternative that some linters will accept is:
`    password=############  # noqa`

$ flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics

```
./scripts/windows_builder/check_and_build.py:52:21: E999 SyntaxError: invalid syntax
password=############
                    ^
```
---
 scripts/windows_builder/check_and_build.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/windows_builder/check_and_build.py b/scripts/windows_builder/check_and_build.py
index 2eb32fb4265d9..2726b20dfea9d 100644
--- a/scripts/windows_builder/check_and_build.py
+++ b/scripts/windows_builder/check_and_build.py
@@ -49,7 +49,7 @@
 base_dir = "C:\Users\Jeff Reback\Documents\GitHub\pandas"
 remote_host='pandas.pydata.org'
 username='pandas'
-password=############
+password='############'
 
 # drop python from our environment to avoid
 # passing this onto sub-processes

From eb02e035e8e7b8a15f5707d99fd3238260c85dd6 Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Sun, 16 Jul 2017 21:25:48 +0200
Subject: [PATCH 04/37] spaces around operator

---
 scripts/windows_builder/check_and_build.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/windows_builder/check_and_build.py b/scripts/windows_builder/check_and_build.py
index 2726b20dfea9d..4a555e5c16b09 100644
--- a/scripts/windows_builder/check_and_build.py
+++ b/scripts/windows_builder/check_and_build.py
@@ -49,7 +49,7 @@
 base_dir = "C:\Users\Jeff Reback\Documents\GitHub\pandas"
 remote_host='pandas.pydata.org'
 username='pandas'
-password='############'
+password = '############'
 
 # drop python from our environment to avoid
 # passing this onto sub-processes

From e46e85e5b984ec4d4e1f939d20209e002d4c0ce3 Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Mon, 17 Jul 2017 08:58:10 +0200
Subject: [PATCH 05/37] print() function

---
 asv_bench/vbench_to_asv.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/asv_bench/vbench_to_asv.py b/asv_bench/vbench_to_asv.py
index c3041ec2b1ba1..2a4ce5d183ea2 100644
--- a/asv_bench/vbench_to_asv.py
+++ b/asv_bench/vbench_to_asv.py
@@ -114,7 +114,7 @@ def translate_module(target_module):
     l_vars = {}
     exec('import ' + target_module) in g_vars
 
-    print target_module
+    print(target_module)
     module = eval(target_module, g_vars)
 
     benchmarks = []
@@ -157,7 +157,7 @@ def translate_module(target_module):
         mod = os.path.basename(module)
         if mod in ['make.py', 'measure_memory_consumption.py', 'perf_HEAD.py', 'run_suite.py', 'test_perf.py', 'generate_rst_files.py', 'test.py', 'suite.py']:
             continue
-        print
-        print mod
+        print('')
+        print(mod)
 
         translate_module(mod.replace('.py', ''))

From 082c503594bb2f56256c353897b2f9ba5671f269 Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Mon, 17 Jul 2017 09:03:40 +0200
Subject: [PATCH 06/37] raw_input = input in Python 3

---
 doc/source/conf.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/doc/source/conf.py b/doc/source/conf.py
index 394fa44c30573..cb3063d59beae 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -17,6 +17,11 @@
 import importlib
 from pandas.compat import u, PY3
 
+try:
+    raw_input          # Python 2
+except NameError:
+    raw_input = input  # Python 3
+
 # https://github.com/sphinx-doc/sphinx/pull/2325/files
 # Workaround for sphinx-build recursion limit overflow:
 # pickle.dump(doctree, f, pickle.HIGHEST_PROTOCOL)

From b5a1ef22feb36fffdaf8e7f6841907b613bdc42e Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Mon, 17 Jul 2017 09:58:19 +0200
Subject: [PATCH 07/37] from pandas.compat import text_type

---
 doc/sphinxext/ipython_sphinxext/ipython_directive.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/doc/sphinxext/ipython_sphinxext/ipython_directive.py b/doc/sphinxext/ipython_sphinxext/ipython_directive.py
index 49fbacba99592..922767a8e2d46 100644
--- a/doc/sphinxext/ipython_sphinxext/ipython_directive.py
+++ b/doc/sphinxext/ipython_sphinxext/ipython_directive.py
@@ -111,7 +111,7 @@
 import sys
 import tempfile
 import ast
-from pandas.compat import zip, range, map, lmap, u, cStringIO as StringIO
+from pandas.compat import zip, range, map, lmap, u, text_type, cStringIO as StringIO
 import warnings
 
 # To keep compatibility with various python versions
@@ -138,10 +138,8 @@
 
 if PY3:
     from io import StringIO
-    text_type = str
 else:
     from StringIO import StringIO
-    text_type = unicode
 
 #-----------------------------------------------------------------------------
 # Globals

From b5a323d298b7c5f3fcbff54ac3a8fd2bac94118e Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Mon, 17 Jul 2017 12:52:56 +0200
Subject: [PATCH 08/37] Delete alignment.py

---
 bench/alignment.py | 22 ----------------------
 1 file changed, 22 deletions(-)
 delete mode 100644 bench/alignment.py

diff --git a/bench/alignment.py b/bench/alignment.py
deleted file mode 100644
index bc3134f597ee0..0000000000000
--- a/bench/alignment.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# Setup
-from pandas.compat import range, lrange
-import numpy as np
-import pandas
-import la
-N = 1000
-K = 50
-arr1 = np.random.randn(N, K)
-arr2 = np.random.randn(N, K)
-idx1 = lrange(N)
-idx2 = lrange(K)
-
-# pandas
-dma1 = pandas.DataFrame(arr1, idx1, idx2)
-dma2 = pandas.DataFrame(arr2, idx1[::-1], idx2[::-1])
-
-# larry
-lar1 = la.larry(arr1, [idx1, idx2])
-lar2 = la.larry(arr2, [idx1[::-1], idx2[::-1]])
-
-for i in range(100):
-    result = lar1 + lar2

From 12d26dff2f110805779d78de982bfb3dc8f15581 Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Mon, 17 Jul 2017 12:53:06 +0200
Subject: [PATCH 09/37] Delete bench_dense_to_sparse.py

---
 bench/bench_dense_to_sparse.py | 14 --------------
 1 file changed, 14 deletions(-)
 delete mode 100644 bench/bench_dense_to_sparse.py

diff --git a/bench/bench_dense_to_sparse.py b/bench/bench_dense_to_sparse.py
deleted file mode 100644
index e1dcd3456e88d..0000000000000
--- a/bench/bench_dense_to_sparse.py
+++ /dev/null
@@ -1,14 +0,0 @@
-from pandas import *
-
-K = 100
-N = 100000
-rng = DatetimeIndex('1/1/2000', periods=N, offset=datetools.Minute())
-
-rng2 = np.asarray(rng).astype('M8[us]').astype('i8')
-
-series = {}
-for i in range(1, K + 1):
-    data = np.random.randn(N)[:-i]
-    this_rng = rng2[:-i]
-    data[100:] = np.nan
-    series[i] = SparseSeries(data, index=this_rng)

From 4e353fa72ac9ead4bf3ec28e2fe95d92ec02b470 Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Mon, 17 Jul 2017 12:53:17 +0200
Subject: [PATCH 10/37] Delete bench_get_put_value.py

---
 bench/bench_get_put_value.py | 56 ------------------------------------
 1 file changed, 56 deletions(-)
 delete mode 100644 bench/bench_get_put_value.py

diff --git a/bench/bench_get_put_value.py b/bench/bench_get_put_value.py
deleted file mode 100644
index 427e0b1b10a22..0000000000000
--- a/bench/bench_get_put_value.py
+++ /dev/null
@@ -1,56 +0,0 @@
-from pandas import *
-from pandas.util.testing import rands
-from pandas.compat import range
-
-N = 1000
-K = 50
-
-
-def _random_index(howmany):
-    return Index([rands(10) for _ in range(howmany)])
-
-df = DataFrame(np.random.randn(N, K), index=_random_index(N),
-               columns=_random_index(K))
-
-
-def get1():
-    for col in df.columns:
-        for row in df.index:
-            _ = df[col][row]
-
-
-def get2():
-    for col in df.columns:
-        for row in df.index:
-            _ = df.get_value(row, col)
-
-
-def put1():
-    for col in df.columns:
-        for row in df.index:
-            df[col][row] = 0
-
-
-def put2():
-    for col in df.columns:
-        for row in df.index:
-            df.set_value(row, col, 0)
-
-
-def resize1():
-    buf = DataFrame()
-    for col in df.columns:
-        for row in df.index:
-            buf = buf.set_value(row, col, 5.)
-    return buf
-
-
-def resize2():
-    from collections import defaultdict
-
-    buf = defaultdict(dict)
-    for col in df.columns:
-        for row in df.index:
-            buf[col][row] = 5.
-
-    return DataFrame(buf)

From 4eb125c05682cb9f3b0d2e3ed9d396b4be75eead Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Mon, 17 Jul 2017 12:53:27 +0200
Subject: [PATCH 11/37] Delete bench_groupby.py

---
 bench/bench_groupby.py | 66 ------------------------------------------
 1 file changed, 66 deletions(-)
 delete mode 100644 bench/bench_groupby.py

diff --git a/bench/bench_groupby.py b/bench/bench_groupby.py
deleted file mode 100644
index d7a2853e1e7b2..0000000000000
--- a/bench/bench_groupby.py
+++ /dev/null
@@ -1,66 +0,0 @@
-from pandas import *
-from pandas.util.testing import rands
-from pandas.compat import range
-
-import string
-import random
-
-k = 20000
-n = 10
-
-foo = np.tile(np.array([rands(10) for _ in range(k)], dtype='O'), n)
-foo2 = list(foo)
-random.shuffle(foo)
-random.shuffle(foo2)
-
-df = DataFrame({'A': foo,
-                'B': foo2,
-                'C': np.random.randn(n * k)})
-
-import pandas._sandbox as sbx
-
-
-def f():
-    table = sbx.StringHashTable(len(df))
-    ret = table.factorize(df['A'])
-    return ret
-
-
-def g():
-    table = sbx.PyObjectHashTable(len(df))
-    ret = table.factorize(df['A'])
-    return ret
-
-ret = f()
-
-"""
-import pandas._tseries as lib
-
-f = np.std
-
-
-grouped = df.groupby(['A', 'B'])
-
-label_list = [ping.labels for ping in grouped.groupings]
-shape = [len(ping.ids) for ping in grouped.groupings]
-
-from pandas.core.groupby import get_group_index
-
-
-group_index = get_group_index(label_list, shape,
-                              sort=True, xnull=True).astype('i4')
-
-ngroups = np.prod(shape)
-
-indexer = lib.groupsort_indexer(group_index, ngroups)
-
-values = df['C'].values.take(indexer)
-group_index = group_index.take(indexer)
-
-f = lambda x: x.std(ddof=1)
-
-grouper = lib.Grouper(df['C'], np.ndarray.std, group_index, ngroups)
-result = grouper.get_result()
-
-expected = grouped.std()
-"""

From db9df8d52124e22701ea845cb20e6f5f964b9289 Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Mon, 17 Jul 2017 12:53:37 +0200
Subject: [PATCH 12/37] Delete bench_join_panel.py

---
 bench/bench_join_panel.py | 88 ---------------------------------------
 1 file changed, 88 deletions(-)
 delete mode 100644 bench/bench_join_panel.py

diff --git a/bench/bench_join_panel.py b/bench/bench_join_panel.py
deleted file mode 100644
index c16ff907efdb4..0000000000000
--- a/bench/bench_join_panel.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# reasonably efficient
-
-import numpy as np
-from pandas import Panel
-
-
-def create_panels_append(cls, panels):
-        """ return an append list of panels """
-        panels = [a for a in panels if a is not None]
-        # corner cases
-        if len(panels) == 0:
-                return None
-        elif len(panels) == 1:
-                return panels[0]
-        elif len(panels) == 2 and panels[0] == panels[1]:
-                return panels[0]
-        # import pdb; pdb.set_trace()
-        # create a joint index for the axis
-
-        def joint_index_for_axis(panels, axis):
-                s = set()
-                for p in panels:
-                        s.update(list(getattr(p, axis)))
-                return sorted(list(s))
-
-        def reindex_on_axis(panels, axis, axis_reindex):
-                new_axis = joint_index_for_axis(panels, axis)
-                new_panels = [p.reindex(**{axis_reindex: new_axis,
-                                        'copy': False}) for p in panels]
-                return new_panels, new_axis
-        # create the joint major index, dont' reindex the sub-panels - we are
-        # appending
-        major = joint_index_for_axis(panels, 'major_axis')
-        # reindex on minor axis
-        panels, minor = reindex_on_axis(panels, 'minor_axis', 'minor')
-        # reindex on items
-        panels, items = reindex_on_axis(panels, 'items', 'items')
-        # concatenate values
-        try:
-                values = np.concatenate([p.values for p in panels], axis=1)
-        except Exception as detail:
-                raise Exception("cannot append values that dont' match dimensions! -> [%s] %s"
-                                % (','.join(["%s" % p for p in panels]), str(detail)))
-        # pm('append - create_panel')
-        p = Panel(values, items=items, major_axis=major,
-                  minor_axis=minor)
-        # pm('append - done')
-        return p
-
-
-# does the job but inefficient (better to handle like you read a table in
-# pytables...e.g create a LongPanel then convert to Wide)
-def create_panels_join(cls, panels):
-        """ given an array of panels's, create a single panel """
-        panels = [a for a in panels if a is not None]
-        # corner cases
-        if len(panels) == 0:
-                return None
-        elif len(panels) == 1:
-                return panels[0]
-        elif len(panels) == 2 and panels[0] == panels[1]:
-                return panels[0]
-        d = dict()
-        minor, major, items = set(), set(), set()
-        for panel in panels:
-                items.update(panel.items)
-                major.update(panel.major_axis)
-                minor.update(panel.minor_axis)
-                values = panel.values
-                for item, item_index in panel.items.indexMap.items():
-                        for minor_i, minor_index in panel.minor_axis.indexMap.items():
-                                for major_i, major_index in panel.major_axis.indexMap.items():
-                                        try:
-                                                d[(minor_i, major_i, item)] = values[item_index, major_index, minor_index]
-                                        except:
-                                                pass
-        # stack the values
-        minor = sorted(list(minor))
-        major = sorted(list(major))
-        items = sorted(list(items))
-        # create the 3d stack (items x columns x indicies)
-        data = np.dstack([np.asarray([np.asarray([d.get((minor_i, major_i, item), np.nan)
-                                                  for item in items])
-                                      for major_i in major]).transpose()
-                          for minor_i in minor])
-        # construct the panel
-        return Panel(data, items, major, minor)
-# add_class_method(Panel, create_panels_join, 'join_many')

From c0c24e495b1791713cc22ae2e17e243b45bdddb7 Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Mon, 17 Jul 2017 12:53:48 +0200
Subject: [PATCH 13/37] Delete bench_khash_dict.py

---
 bench/bench_khash_dict.py | 89 ---------------------------------------
 1 file changed, 89 deletions(-)
 delete mode 100644 bench/bench_khash_dict.py

diff --git a/bench/bench_khash_dict.py b/bench/bench_khash_dict.py
deleted file mode 100644
index 054fc36131b65..0000000000000
--- a/bench/bench_khash_dict.py
+++ /dev/null
@@ -1,89 +0,0 @@
-"""
-Some comparisons of khash.h to Python dict
-"""
-from __future__ import print_function
-
-import numpy as np
-import os
-
-from vbench.api import Benchmark
-from pandas.util.testing import rands
-from pandas.compat import range
-import pandas._tseries as lib
-import pandas._sandbox as sbx
-import time
-
-import psutil
-
-pid = os.getpid()
-proc = psutil.Process(pid)
-
-
-def object_test_data(n):
-    pass
-
-
-def string_test_data(n):
-    return np.array([rands(10) for _ in range(n)], dtype='O')
-
-
-def int_test_data(n):
-    return np.arange(n, dtype='i8')
-
-N = 1000000
-
-#----------------------------------------------------------------------
-# Benchmark 1: map_locations
-
-
-def map_locations_python_object():
-    arr = string_test_data(N)
-    return _timeit(lambda: lib.map_indices_object(arr))
-
-
-def map_locations_khash_object():
-    arr = string_test_data(N)
-
-    def f():
-        table = sbx.PyObjectHashTable(len(arr))
-        table.map_locations(arr)
-    return _timeit(f)
-
-
-def _timeit(f, iterations=10):
-    start = time.time()
-    for _ in range(iterations):
-        foo = f()
-    elapsed = time.time() - start
-    return elapsed
-
-#----------------------------------------------------------------------
-# Benchmark 2: lookup_locations
-
-
-def lookup_python(values):
-    table = lib.map_indices_object(values)
-    return _timeit(lambda: lib.merge_indexer_object(values, table))
-
-
-def lookup_khash(values):
-    table = sbx.PyObjectHashTable(len(values))
-    table.map_locations(values)
-    locs = table.lookup_locations(values)
-    # elapsed = _timeit(lambda: table.lookup_locations2(values))
-    return table
-
-
-def leak(values):
-    for _ in range(100):
-        print(proc.get_memory_info())
-        table = lookup_khash(values)
-        # table.destroy()
-
-arr = string_test_data(N)
-
-#----------------------------------------------------------------------
-# Benchmark 3: unique
-
-#----------------------------------------------------------------------
-# Benchmark 4: factorize

From 179dbe3d7da4e0574f00d0e6e5e7c6898565918b Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Mon, 17 Jul 2017 12:54:00 +0200
Subject: [PATCH 14/37] Delete bench_merge.R

---
 bench/bench_merge.R | 161 --------------------------------------------
 1 file changed, 161 deletions(-)
 delete mode 100644 bench/bench_merge.R

diff --git a/bench/bench_merge.R b/bench/bench_merge.R
deleted file mode 100644
index 3ed4618494857..0000000000000
--- a/bench/bench_merge.R
+++ /dev/null
@@ -1,161 +0,0 @@
-library(plyr)
-library(data.table)
-N <- 10000
-indices = rep(NA, N)
-indices2 = rep(NA, N)
-for (i in 1:N) {
-  indices[i] <- paste(sample(letters, 10), collapse="")
-  indices2[i] <- paste(sample(letters, 10), collapse="")
-}
-left <- data.frame(key=rep(indices[1:8000], 10),
-                   key2=rep(indices2[1:8000], 10),
-                   value=rnorm(80000))
-right <- data.frame(key=indices[2001:10000],
-                    key2=indices2[2001:10000],
-                    value2=rnorm(8000))
-
-right2 <- data.frame(key=rep(right$key, 2),
-                     key2=rep(right$key2, 2),
-                     value2=rnorm(16000))
-
-left.dt <- data.table(left, key=c("key", "key2"))
-right.dt <- data.table(right, key=c("key", "key2"))
-right2.dt <- data.table(right2, key=c("key", "key2"))
-
-# left.dt2 <- data.table(left)
-# right.dt2 <- data.table(right)
-
-## left <- data.frame(key=rep(indices[1:1000], 10),
-##                    key2=rep(indices2[1:1000], 10),
-##                    value=rnorm(100000))
-## right <- data.frame(key=indices[1:1000],
-##                     key2=indices2[1:1000],
-##                     value2=rnorm(10000))
-
-timeit <- function(func, niter=10) {
-  timing = rep(NA, niter)
-  for (i in 1:niter) {
-    gc()
-    timing[i] <- system.time(func())[3]
-  }
-  mean(timing)
-}
-
-left.join <- function(sort=FALSE) {
-  result <- base::merge(left, right, all.x=TRUE, sort=sort)
-}
-
-right.join <- function(sort=FALSE) {
-  result <- base::merge(left, right, all.y=TRUE, sort=sort)
-}
-
-outer.join <- function(sort=FALSE) {
-  result <- base::merge(left, right, all=TRUE, sort=sort)
-}
-
-inner.join <- function(sort=FALSE) {
-  result <- base::merge(left, right, all=FALSE, sort=sort)
-}
-
-left.join.dt <- function(sort=FALSE) {
-  result <- right.dt[left.dt]
-}
-
-right.join.dt <- function(sort=FALSE) {
-  result <- left.dt[right.dt]
-}
-
-outer.join.dt <- function(sort=FALSE) {
-  result <- merge(left.dt, right.dt, all=TRUE, sort=sort)
-}
-
-inner.join.dt <- function(sort=FALSE) {
-  result <- merge(left.dt, right.dt, all=FALSE, sort=sort)
-}
-
-plyr.join <- function(type) {
-  result <- plyr::join(left, right, by=c("key", "key2"),
-                       type=type, match="first")
-}
-
-sort.options <- c(FALSE, TRUE)
-
-# many-to-one
-
-results <- matrix(nrow=4, ncol=3)
-colnames(results) <- c("base::merge", "plyr", "data.table")
-rownames(results) <- c("inner", "outer", "left", "right")
-
-base.functions <- c(inner.join, outer.join, left.join, right.join)
-plyr.functions <- c(function() plyr.join("inner"),
-                    function() plyr.join("full"),
-                    function() plyr.join("left"),
-					function() plyr.join("right"))
-dt.functions <- c(inner.join.dt, outer.join.dt, left.join.dt, right.join.dt)
-for (i in 1:4) {
-  base.func <- base.functions[[i]]
-  plyr.func <- plyr.functions[[i]]
-  dt.func <- dt.functions[[i]]
-  results[i, 1] <- timeit(base.func)
-  results[i, 2] <- timeit(plyr.func)
-  results[i, 3] <- timeit(dt.func)
-}
-
-
-# many-to-many
-
-left.join <- function(sort=FALSE) {
-  result <- base::merge(left, right2, all.x=TRUE, sort=sort)
-}
-
-right.join <- function(sort=FALSE) {
-  result <- base::merge(left, right2, all.y=TRUE, sort=sort)
-}
-
-outer.join <- function(sort=FALSE) {
-  result <- base::merge(left, right2, all=TRUE, sort=sort)
-}
-
-inner.join <- function(sort=FALSE) {
-  result <- base::merge(left, right2, all=FALSE, sort=sort)
-}
-
-left.join.dt <- function(sort=FALSE) {
-  result <- right2.dt[left.dt]
-}
-
-right.join.dt <- function(sort=FALSE) {
-  result <- left.dt[right2.dt]
-}
-
-outer.join.dt <- function(sort=FALSE) {
-  result <- merge(left.dt, right2.dt, all=TRUE, sort=sort)
-}
-
-inner.join.dt <- function(sort=FALSE) {
-  result <- merge(left.dt, right2.dt, all=FALSE, sort=sort)
-}
-
-sort.options <- c(FALSE, TRUE)
-
-# many-to-one
-
-results <- matrix(nrow=4, ncol=3)
-colnames(results) <- c("base::merge", "plyr", "data.table")
-rownames(results) <- c("inner", "outer", "left", "right")
-
-base.functions <- c(inner.join, outer.join, left.join, right.join)
-plyr.functions <- c(function() plyr.join("inner"),
-                    function() plyr.join("full"),
-                    function() plyr.join("left"),
-					function() plyr.join("right"))
-dt.functions <- c(inner.join.dt, outer.join.dt, left.join.dt, right.join.dt)
-for (i in 1:4) {
-  base.func <- base.functions[[i]]
-  plyr.func <- plyr.functions[[i]]
-  dt.func <- dt.functions[[i]]
-  results[i, 1] <- timeit(base.func)
-  results[i, 2] <- timeit(plyr.func)
-  results[i, 3] <- timeit(dt.func)
-}
-

From aca919d5ab97c8c58b3353113d8384556ecccc35 Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Mon, 17 Jul 2017 12:54:13 +0200
Subject: [PATCH 15/37] Delete bench_merge.py

---
 bench/bench_merge.py | 105 -------------------------------------------
 1 file changed, 105 deletions(-)
 delete mode 100644 bench/bench_merge.py

diff --git a/bench/bench_merge.py b/bench/bench_merge.py
deleted file mode 100644
index 330dba7b9af69..0000000000000
--- a/bench/bench_merge.py
+++ /dev/null
@@ -1,105 +0,0 @@
-import random
-import gc
-import time
-from pandas import *
-from pandas.compat import range, lrange, StringIO
-from pandas.util.testing import rands
-
-N = 10000
-ngroups = 10
-
-
-def get_test_data(ngroups=100, n=N):
-    unique_groups = lrange(ngroups)
-    arr = np.asarray(np.tile(unique_groups, n / ngroups), dtype=object)
-
-    if len(arr) < n:
-        arr = np.asarray(list(arr) + unique_groups[:n - len(arr)],
-                         dtype=object)
-
-    random.shuffle(arr)
-    return arr
-
-# aggregate multiple columns
-# df = DataFrame({'key1' : get_test_data(ngroups=ngroups),
-#                 'key2' : get_test_data(ngroups=ngroups),
-#                 'data1' : np.random.randn(N),
-#                 'data2' : np.random.randn(N)})
-
-# df2 = DataFrame({'key1'  : get_test_data(ngroups=ngroups, n=N//10),
-#                  'key2'  : get_test_data(ngroups=ngroups//2, n=N//10),
-#                  'value' : np.random.randn(N // 10)})
-# result = merge.merge(df, df2, on='key2')
-
-N = 10000
-
-indices = np.array([rands(10) for _ in range(N)], dtype='O')
-indices2 = np.array([rands(10) for _ in range(N)], dtype='O')
-key = np.tile(indices[:8000], 10)
-key2 = np.tile(indices2[:8000], 10)
-
-left = DataFrame({'key': key, 'key2': key2,
-                  'value': np.random.randn(80000)})
-right = DataFrame({'key': indices[2000:], 'key2': indices2[2000:],
-                   'value2': np.random.randn(8000)})
-
-right2 = right.append(right, ignore_index=True)
-
-
-join_methods = ['inner', 'outer', 'left', 'right']
-results = DataFrame(index=join_methods, columns=[False, True])
-niter = 10
-for sort in [False, True]:
-    for join_method in join_methods:
-        f = lambda: merge(left, right, how=join_method, sort=sort)
-        gc.disable()
-        start = time.time()
-        for _ in range(niter):
-            f()
-        elapsed = (time.time() - start) / niter
-        gc.enable()
-        results[sort][join_method] = elapsed
-# results.columns = ['pandas']
-results.columns = ['dont_sort', 'sort']
-
-
-# R results
-# many to one
-r_results = read_table(StringIO("""      base::merge   plyr data.table
-inner      0.2475 0.1183     0.1100
-outer      0.4213 0.1916     0.2090
-left       0.2998 0.1188     0.0572
-right      0.3102 0.0536     0.0376
-"""), sep='\s+')
-
-presults = results[['dont_sort']].rename(columns={'dont_sort': 'pandas'})
-all_results = presults.join(r_results)
-
-all_results = all_results.div(all_results['pandas'], axis=0)
-
-all_results = all_results.ix[:, ['pandas', 'data.table', 'plyr',
-                                 'base::merge']]
-
-sort_results = DataFrame.from_items([('pandas', results['sort']),
-                                     ('R', r_results['base::merge'])])
-sort_results['Ratio'] = sort_results['R'] / sort_results['pandas']
-
-
-nosort_results = DataFrame.from_items([('pandas', results['dont_sort']),
-                                       ('R', r_results['base::merge'])])
-nosort_results['Ratio'] = nosort_results['R'] / nosort_results['pandas']
-
-# many to many
-
-# many to one
-r_results = read_table(StringIO("""base::merge   plyr data.table
-inner      0.4610 0.1276     0.1269
-outer      0.9195 0.1881     0.2725
-left       0.6559 0.1257     0.0678
-right      0.6425 0.0522     0.0428
-"""), sep='\s+')
-
-all_results = presults.join(r_results)
-all_results = all_results.div(all_results['pandas'], axis=0)
-all_results = all_results.ix[:, ['pandas', 'data.table', 'plyr',
-                                 'base::merge']]

From 67bdf7c42b4d40826ad7372236e67999560d674f Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Mon, 17 Jul 2017 12:54:23 +0200
Subject: [PATCH 16/37] Delete bench_merge_sqlite.py

---
 bench/bench_merge_sqlite.py | 87 -------------------------------------
 1 file changed, 87 deletions(-)
 delete mode 100644 bench/bench_merge_sqlite.py

diff --git a/bench/bench_merge_sqlite.py b/bench/bench_merge_sqlite.py
deleted file mode 100644
index 3ad4b810119c3..0000000000000
--- a/bench/bench_merge_sqlite.py
+++ /dev/null
@@ -1,87 +0,0 @@
-import numpy as np
-from collections import defaultdict
-import gc
-import time
-from pandas import DataFrame
-from pandas.util.testing import rands
-from pandas.compat import range, zip
-import random
-
-N = 10000
-
-indices = np.array([rands(10) for _ in range(N)], dtype='O')
-indices2 = np.array([rands(10) for _ in range(N)], dtype='O')
-key = np.tile(indices[:8000], 10)
-key2 = np.tile(indices2[:8000], 10)
-
-left = DataFrame({'key': key, 'key2': key2,
-                  'value': np.random.randn(80000)})
-right = DataFrame({'key': indices[2000:], 'key2': indices2[2000:],
-                   'value2': np.random.randn(8000)})
-
-# right2 = right.append(right, ignore_index=True)
-# right = right2
-
-# random.shuffle(key2)
-# indices2 = indices.copy()
-# random.shuffle(indices2)
-
-# Prepare Database
-import sqlite3
-create_sql_indexes = True
-
-conn = sqlite3.connect(':memory:')
-conn.execute(
-    'create table left( key varchar(10), key2 varchar(10), value int);')
-conn.execute(
-    'create table right( key varchar(10), key2 varchar(10), value2 int);')
-conn.executemany('insert into left values (?, ?, ?)',
-                 zip(key, key2, left['value']))
-conn.executemany('insert into right values (?, ?, ?)',
-                 zip(right['key'], right['key2'], right['value2']))
-
-# Create Indices
-if create_sql_indexes:
-    conn.execute('create index left_ix on left(key, key2)')
-    conn.execute('create index right_ix on right(key, key2)')
-
-
-join_methods = ['inner', 'left outer', 'left']  # others not supported
-sql_results = DataFrame(index=join_methods, columns=[False])
-niter = 5
-for sort in [False]:
-    for join_method in join_methods:
-        sql = """CREATE TABLE test as select *
-        from left
-           %s join right
-             on left.key=right.key
-               and left.key2 = right.key2;""" % join_method
-        sql = """select *
-        from left
-           %s join right
-             on left.key=right.key
-               and left.key2 = right.key2;""" % join_method
-
-        if sort:
-            sql = '%s order by key, key2' % sql
-        f = lambda: list(conn.execute(sql))  # list fetches results
-        g = lambda: conn.execute(sql)  # list fetches results
-        gc.disable()
-        start = time.time()
-        # for _ in range(niter):
-        g()
-        elapsed = (time.time() - start) / niter
-        gc.enable()
-
-        cur = conn.execute("DROP TABLE test")
-        conn.commit()
-
-        sql_results[sort][join_method] = elapsed
-        sql_results.columns = ['sqlite3']  # ['dont_sort', 'sort']
-        sql_results.index = ['inner', 'outer', 'left']
-
-        sql = """select *
-        from left
-           inner join right
-             on left.key=right.key
-               and left.key2 = right.key2;"""

From 007bf4444b1ce41e221bf57f4a299918599849a7 Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Mon, 17 Jul 2017 12:54:33 +0200
Subject: [PATCH 17/37] Delete bench_pivot.R

---
 bench/bench_pivot.R | 27 ---------------------------
 1 file changed, 27 deletions(-)
 delete mode 100644 bench/bench_pivot.R

diff --git a/bench/bench_pivot.R b/bench/bench_pivot.R
deleted file mode 100644
index 06dc6a105bc43..0000000000000
--- a/bench/bench_pivot.R
+++ /dev/null
@@ -1,27 +0,0 @@
-library(reshape2)
-
-
-n <- 100000
-a.size <- 5
-b.size <- 5
-
-data <- data.frame(a=sample(letters[1:a.size], n, replace=T),
-                   b=sample(letters[1:b.size], n, replace=T),
-                   c=rnorm(n),
-                   d=rnorm(n))
-
-timings <- numeric()
-
-# acast(melt(data, id=c("a", "b")), a ~ b, mean)
-# acast(melt(data, id=c("a", "b")), a + b ~ variable, mean)
-
-for (i in 1:10) {
-  gc()
-  tim <- system.time(acast(melt(data, id=c("a", "b")), a ~ b, mean,
-                           subset=.(variable=="c")))
-  timings[i] = tim[3]
-}
-
-mean(timings)
-
-acast(melt(data, id=c("a", "b")), a ~ b, mean, subset=.(variable="c"))

From ec714df9989a5af1c83637aaa1f8f9963cd122ea Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Mon, 17 Jul 2017 12:54:46 +0200
Subject: [PATCH 18/37] Delete bench_pivot.py

---
 bench/bench_pivot.py | 16 ----------------
 1 file changed, 16 deletions(-)
 delete mode 100644 bench/bench_pivot.py

diff --git a/bench/bench_pivot.py b/bench/bench_pivot.py
deleted file mode 100644
index 007bd0aaebc2f..0000000000000
--- a/bench/bench_pivot.py
+++ /dev/null
@@ -1,16 +0,0 @@
-from pandas import *
-import string
-
-
-n = 100000
-asize = 5
-bsize = 5
-
-letters = np.asarray(list(string.letters), dtype=object)
-
-data = DataFrame(dict(foo=letters[:asize][np.random.randint(0, asize, n)],
-                      bar=letters[:bsize][np.random.randint(0, bsize, n)],
-                      baz=np.random.randn(n),
-                      qux=np.random.randn(n)))
-
-table = pivot_table(data, xby=['foo', 'bar'])

From 504464ae1bc036884a8cfe338aa3e773c060529b Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Mon, 17 Jul 2017 12:55:08 +0200
Subject: [PATCH 19/37] Delete bench_take_indexing.py

---
 bench/bench_take_indexing.py | 55 ------------------------------------
 1 file changed, 55 deletions(-)
 delete mode 100644 bench/bench_take_indexing.py

diff --git a/bench/bench_take_indexing.py b/bench/bench_take_indexing.py
deleted file mode 100644
index 5fb584bcfe45f..0000000000000
--- a/bench/bench_take_indexing.py
+++ /dev/null
@@ -1,55 +0,0 @@
-from __future__ import print_function
-import numpy as np
-
-from pandas import *
-import pandas._tseries as lib
-
-from pandas import DataFrame
-import timeit
-from pandas.compat import zip
-
-setup = """
-from pandas import Series
-import pandas._tseries as lib
-import random
-import numpy as np
-
-import random
-n = %d
-k = %d
-arr = np.random.randn(n, k)
-indexer = np.arange(n, dtype=np.int32)
-indexer = indexer[::-1]
-"""
-
-sizes = [100, 1000, 10000, 100000]
-iters = [1000, 1000, 100, 1]
-
-fancy_2d = []
-take_2d = []
-cython_2d = []
-
-n = 1000
-
-
-def _timeit(stmt, size, k=5, iters=1000):
-    timer = timeit.Timer(stmt=stmt, setup=setup % (sz, k))
-    return timer.timeit(n) / n
-
-for sz, its in zip(sizes, iters):
-    print(sz)
-    fancy_2d.append(_timeit('arr[indexer]', sz, iters=its))
-    take_2d.append(_timeit('arr.take(indexer, axis=0)', sz, iters=its))
-    cython_2d.append(_timeit('lib.take_axis0(arr, indexer)', sz, iters=its))
-
-df = DataFrame({'fancy': fancy_2d,
-                'take': take_2d,
-                'cython': cython_2d})
-
-print(df)
-
-from pandas.rpy.common import r
-r('mat <- matrix(rnorm(50000), nrow=10000, ncol=5)')
-r('set.seed(12345')
-r('indexer <- sample(1:10000)')
-r('mat[indexer,]')

From e7aca5c56157e100a61597623cefb6d0e863f7e8 Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Mon, 17 Jul 2017 12:55:22 +0200
Subject: [PATCH 20/37] Delete bench_unique.py

---
 bench/bench_unique.py | 278 ------------------------------------------
 1 file changed, 278 deletions(-)
 delete mode 100644 bench/bench_unique.py

diff --git a/bench/bench_unique.py b/bench/bench_unique.py
deleted file mode 100644
index 87bd2f2df586c..0000000000000
--- a/bench/bench_unique.py
+++ /dev/null
@@ -1,278 +0,0 @@
-from __future__ import print_function
-from pandas import *
-from pandas.util.testing import rands
-from pandas.compat import range, zip
-import pandas._tseries as lib
-import numpy as np
-import matplotlib.pyplot as plt
-
-N = 50000
-K = 10000
-
-groups = np.array([rands(10) for _ in range(K)], dtype='O')
-groups2 = np.array([rands(10) for _ in range(K)], dtype='O')
-
-labels = np.tile(groups, N // K)
-labels2 = np.tile(groups2, N // K)
-data = np.random.randn(N)
-
-
-def timeit(f, niter):
-    import gc
-    import time
-    gc.disable()
-    start = time.time()
-    for _ in range(niter):
-        f()
-    elapsed = (time.time() - start) / niter
-    gc.enable()
-    return elapsed
-
-
-def algo1():
-    unique_labels = np.unique(labels)
-    result = np.empty(len(unique_labels))
-    for i, label in enumerate(unique_labels):
-        result[i] = data[labels == label].sum()
-
-
-def algo2():
-    unique_labels = np.unique(labels)
-    indices = lib.groupby_indices(labels)
-    result = np.empty(len(unique_labels))
-
-    for i, label in enumerate(unique_labels):
-        result[i] = data.take(indices[label]).sum()
-
-
-def algo3_nosort():
-    rizer = lib.DictFactorizer()
-    labs, counts = rizer.factorize(labels, sort=False)
-    k = len(rizer.uniques)
-    out = np.empty(k)
-    lib.group_add(out, counts, data, labs)
-
-
-def algo3_sort():
-    rizer = lib.DictFactorizer()
-    labs, counts = rizer.factorize(labels, sort=True)
-    k = len(rizer.uniques)
-    out = np.empty(k)
-    lib.group_add(out, counts, data, labs)
-
-import numpy as np
-import random
-
-
-# dict to hold results
-counts = {}
-
-# a hack to generate random key, value pairs.
-# 5k keys, 100k values
-x = np.tile(np.arange(5000, dtype='O'), 20)
-random.shuffle(x)
-xarr = x
-x = [int(y) for y in x]
-data = np.random.uniform(0, 1, 100000)
-
-
-def f():
-    # groupby sum
-    for k, v in zip(x, data):
-        try:
-            counts[k] += v
-        except KeyError:
-            counts[k] = v
-
-
-def f2():
-    rizer = lib.DictFactorizer()
-    labs, counts = rizer.factorize(xarr, sort=False)
-    k = len(rizer.uniques)
-    out = np.empty(k)
-    lib.group_add(out, counts, data, labs)
-
-
-def algo4():
-    rizer = lib.DictFactorizer()
-    labs1, _ = rizer.factorize(labels, sort=False)
-    k1 = len(rizer.uniques)
-
-    rizer = lib.DictFactorizer()
-    labs2, _ = rizer.factorize(labels2, sort=False)
-    k2 = len(rizer.uniques)
-
-    group_id = labs1 * k2 + labs2
-    max_group = k1 * k2
-
-    if max_group > 1e6:
-        rizer = lib.Int64Factorizer(len(group_id))
-        group_id, _ = rizer.factorize(group_id.astype('i8'), sort=True)
-        max_group = len(rizer.uniques)
-
-    out = np.empty(max_group)
-    counts = np.zeros(max_group, dtype='i4')
-    lib.group_add(out, counts, data, group_id)
-
-# cumtime  percall filename:lineno(function)
-#   0.592    0.592 <string>:1(<module>)
-  # 0.584    0.006 groupby_ex.py:37(algo3_nosort)
-  # 0.535    0.005 {method 'factorize' of DictFactorizer' objects}
-  # 0.047    0.000 {pandas._tseries.group_add}
-  # 0.002    0.000 numeric.py:65(zeros_like)
-  # 0.001    0.000 {method 'fill' of 'numpy.ndarray' objects}
-  # 0.000    0.000 {numpy.core.multiarray.empty_like}
-  # 0.000    0.000 {numpy.core.multiarray.empty}
-
-# UNIQUE timings
-
-# N = 10000000
-# K = 500000
-
-# groups = np.array([rands(10) for _ in range(K)], dtype='O')
-
-# labels = np.tile(groups, N // K)
-data = np.random.randn(N)
-
-data = np.random.randn(N)
-
-Ks = [100, 1000, 5000, 10000, 25000, 50000, 100000]
-
-# Ks = [500000, 1000000, 2500000, 5000000, 10000000]
-
-import psutil
-import os
-import gc
-
-pid = os.getpid()
-proc = psutil.Process(pid)
-
-
-def dict_unique(values, expected_K, sort=False, memory=False):
-    if memory:
-        gc.collect()
-        before_mem = proc.get_memory_info().rss
-
-    rizer = lib.DictFactorizer()
-    result = rizer.unique_int64(values)
-
-    if memory:
-        result = proc.get_memory_info().rss - before_mem
-        return result
-
-    if sort:
-        result.sort()
-    assert(len(result) == expected_K)
-    return result
-
-
-def khash_unique(values, expected_K, size_hint=False, sort=False,
-                 memory=False):
-    if memory:
-        gc.collect()
-        before_mem = proc.get_memory_info().rss
-
-    if size_hint:
-        rizer = lib.Factorizer(len(values))
-    else:
-        rizer = lib.Factorizer(100)
-
-    result = []
-    result = rizer.unique(values)
-
-    if memory:
-        result = proc.get_memory_info().rss - before_mem
-        return result
-
-    if sort:
-        result.sort()
-    assert(len(result) == expected_K)
-
-
-def khash_unique_str(values, expected_K, size_hint=False, sort=False,
-                     memory=False):
-    if memory:
-        gc.collect()
-        before_mem = proc.get_memory_info().rss
-
-    if size_hint:
-        rizer = lib.StringHashTable(len(values))
-    else:
-        rizer = lib.StringHashTable(100)
-
-    result = []
-    result = rizer.unique(values)
-
-    if memory:
-        result = proc.get_memory_info().rss - before_mem
-        return result
-
-    if sort:
-        result.sort()
-    assert(len(result) == expected_K)
-
-
-def khash_unique_int64(values, expected_K, size_hint=False, sort=False):
-    if size_hint:
-        rizer = lib.Int64HashTable(len(values))
-    else:
-        rizer = lib.Int64HashTable(100)
-
-    result = []
-    result = rizer.unique(values)
-
-    if sort:
-        result.sort()
-    assert(len(result) == expected_K)
-
-
-def hash_bench():
-    numpy = []
-    dict_based = []
-    dict_based_sort = []
-    khash_hint = []
-    khash_nohint = []
-    for K in Ks:
-        print(K)
-        # groups = np.array([rands(10) for _ in range(K)])
-        # labels = np.tile(groups, N // K).astype('O')
-
-        groups = np.random.randint(0, long(100000000000), size=K)
-        labels = np.tile(groups, N // K)
-        dict_based.append(timeit(lambda: dict_unique(labels, K), 20))
-        khash_nohint.append(timeit(lambda: khash_unique_int64(labels, K), 20))
-        khash_hint.append(timeit(lambda: khash_unique_int64(labels, K,
-                                                            size_hint=True), 20))
-
-        # memory, hard to get
-        # dict_based.append(np.mean([dict_unique(labels, K, memory=True)
-        #                            for _ in range(10)]))
-        # khash_nohint.append(np.mean([khash_unique(labels, K, memory=True)
-        #                              for _ in range(10)]))
-        # khash_hint.append(np.mean([khash_unique(labels, K, size_hint=True, memory=True)
-        #                            for _ in range(10)]))
-
-        # dict_based_sort.append(timeit(lambda: dict_unique(labels, K,
-        #                                                   sort=True), 10))
-        # numpy.append(timeit(lambda: np.unique(labels), 10))
-
-    # unique_timings = DataFrame({'numpy.unique' : numpy,
-    #                             'dict, no sort' : dict_based,
-    #                             'dict, sort' : dict_based_sort},
-    #                            columns=['dict, no sort',
-    #                                     'dict, sort', 'numpy.unique'],
-    #                            index=Ks)
-
-    unique_timings = DataFrame({'dict': dict_based,
-                                'khash, preallocate': khash_hint,
-                                'khash': khash_nohint},
-                               columns=['khash, preallocate', 'khash', 'dict'],
-                               index=Ks)
-
-    unique_timings.plot(kind='bar', legend=False)
-    plt.legend(loc='best')
-    plt.title('Unique on 100,000 values, int64')
-    plt.xlabel('Number of unique labels')
-    plt.ylabel('Mean execution time')
-
-    plt.show()

From b4abe49e539f6705e197d085e809a661682cad89 Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Mon, 17 Jul 2017 12:56:09 +0200
Subject: [PATCH 21/37] Delete bench_with_subset.py

---
 bench/bench_with_subset.py | 116 -------------------------------------
 1 file changed, 116 deletions(-)
 delete mode 100644 bench/bench_with_subset.py

diff --git a/bench/bench_with_subset.py b/bench/bench_with_subset.py
deleted file mode 100644
index 017401df3f7f3..0000000000000
--- a/bench/bench_with_subset.py
+++ /dev/null
@@ -1,116 +0,0 @@
-#!/usr/bin/env python
-
-"""
-Microbenchmarks for comparison with R's "with" and "subset" functions
-"""
-
-from __future__ import print_function
-import numpy as np
-from numpy import array
-from timeit import repeat as timeit
-from pandas.compat import range, zip
-from pandas import DataFrame
-
-
-setup_common = """from pandas import DataFrame
-from numpy.random import randn
-df = DataFrame(randn(%d, 3), columns=list('abc'))
-%s"""
-
-
-setup_with = "s = 'a + b * (c ** 2 + b ** 2 - a) / (a * c) ** 3'"
-
-
-def bench_with(n, times=10, repeat=3, engine='numexpr'):
-    return np.array(timeit('df.eval(s, engine=%r)' % engine,
-                           setup=setup_common % (n, setup_with),
-                           repeat=repeat, number=times)) / times
-
-
-setup_subset = "s = 'a <= b <= c ** 2 + b ** 2 - a and b > c'"
-
-
-def bench_subset(n, times=10, repeat=3, engine='numexpr'):
-    return np.array(timeit('df.query(s, engine=%r)' % engine,
-                           setup=setup_common % (n, setup_subset),
-                           repeat=repeat, number=times)) / times
-
-
-def bench(mn=1, mx=7, num=100, engines=('python', 'numexpr'), verbose=False):
-    r = np.logspace(mn, mx, num=num).round().astype(int)
-
-    ev = DataFrame(np.empty((num, len(engines))), columns=engines)
-    qu = ev.copy(deep=True)
-
-    ev['size'] = qu['size'] = r
-
-    for engine in engines:
-        for i, n in enumerate(r):
-            if verbose:
-                print('engine: %r, i == %d' % (engine, i))
-            ev.loc[i, engine] = bench_with(n, times=1, repeat=1, engine=engine)
-            qu.loc[i, engine] = bench_subset(n, times=1, repeat=1,
-                                             engine=engine)
-
-    return ev, qu
-
-
-def plot_perf(df, engines, title, filename=None):
-    from matplotlib.pyplot import figure, rc
-
-    try:
-        from mpltools import style
-    except ImportError:
-        pass
-    else:
-        style.use('ggplot')
-
-    rc('text', usetex=True)
-
-    fig = figure(figsize=(4, 3), dpi=100)
-    ax = fig.add_subplot(111)
-
-    for engine in engines:
-        ax.plot(df.size, df[engine], label=engine, lw=2)
-
-    ax.set_xlabel('Number of Rows')
-    ax.set_ylabel('Time (s)')
-    ax.set_title(title)
-    ax.legend(loc='best')
-    ax.tick_params(top=False, right=False)
-
-    fig.tight_layout()
-
-    if filename is not None:
-        fig.savefig(filename)
-
-
-if __name__ == '__main__':
-    import os
-    import pandas as pd
-
-    pandas_dir = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
-    static_path = os.path.join(pandas_dir, 'doc', 'source', '_static')
-
-    join = lambda p: os.path.join(static_path, p)
-
-    fn = join('eval-query-perf-data.h5')
-
-    engines = 'python', 'numexpr'
-
-    if not os.path.exists(fn):
-        ev, qu = bench(verbose=True)
-        ev.to_hdf(fn, 'eval')
-        qu.to_hdf(fn, 'query')
-    else:
-        ev = pd.read_hdf(fn, 'eval')
-        qu = pd.read_hdf(fn, 'query')
-
-    plot_perf(ev, engines, 'DataFrame.eval()', filename=join('eval-perf.png'))
-    plot_perf(qu, engines, 'DataFrame.query()',
-              filename=join('query-perf.png'))
-
-    plot_perf(ev[ev.size <= 50000], engines, 'DataFrame.eval()',
-              filename=join('eval-perf-small.png'))
-    plot_perf(qu[qu.size <= 500000], engines, 'DataFrame.query()',
-              filename=join('query-perf-small.png'))

From eb39c27f1f0b3806a9f5f496a1d9be745041d32a Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Mon, 17 Jul 2017 12:56:21 +0200
Subject: [PATCH 22/37] Delete bench_with_subset.R

---
 bench/bench_with_subset.R | 53 ---------------------------------------
 1 file changed, 53 deletions(-)
 delete mode 100644 bench/bench_with_subset.R

diff --git a/bench/bench_with_subset.R b/bench/bench_with_subset.R
deleted file mode 100644
index 69d0f7a9eec63..0000000000000
--- a/bench/bench_with_subset.R
+++ /dev/null
@@ -1,53 +0,0 @@
-library(microbenchmark)
-library(data.table)
-
-
-data.frame.subset.bench <- function (n=1e7, times=30) {
-    df <- data.frame(a=rnorm(n), b=rnorm(n), c=rnorm(n))
-    print(microbenchmark(subset(df, a <= b & b <= (c ^ 2 + b ^ 2 - a) & b > c),
-                         times=times))
-}
-
-
-# data.table allows something very similar to query with an expression
-# but we have chained comparisons AND we're faster BOO YAH!
-data.table.subset.expression.bench <- function (n=1e7, times=30) {
-    dt <- data.table(a=rnorm(n), b=rnorm(n), c=rnorm(n))
-    print(microbenchmark(dt[, a <= b & b <= (c ^ 2 + b ^ 2 - a) & b > c],
-                         times=times))
-}
-
-
-# compare against subset with data.table for good measure
-data.table.subset.bench <- function (n=1e7, times=30) {
-    dt <- data.table(a=rnorm(n), b=rnorm(n), c=rnorm(n))
-    print(microbenchmark(subset(dt, a <= b & b <= (c ^ 2 + b ^ 2 - a) & b > c),
-                         times=times))
-}
-
-
-data.frame.with.bench <- function (n=1e7, times=30) {
-    df <- data.frame(a=rnorm(n), b=rnorm(n), c=rnorm(n))
-
-    print(microbenchmark(with(df, a + b * (c ^ 2 + b ^ 2 - a) / (a * c) ^ 3),
-                         times=times))
-}
-
-
-data.table.with.bench <- function (n=1e7, times=30) {
-    dt <- data.table(a=rnorm(n), b=rnorm(n), c=rnorm(n))
-    print(microbenchmark(with(dt, a + b * (c ^ 2 + b ^ 2 - a) / (a * c) ^ 3),
-                         times=times))
-}
-
-
-bench <- function () {
-    data.frame.subset.bench()
-    data.table.subset.expression.bench()
-    data.table.subset.bench()
-    data.frame.with.bench()
-    data.table.with.bench()
-}
-
-
-bench()

From 041a22a4e673102bc07aed1de53a50e90bd5e5ba Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Mon, 17 Jul 2017 12:56:35 +0200
Subject: [PATCH 23/37] Delete better_unique.py

---
 bench/better_unique.py | 80 ------------------------------------------
 1 file changed, 80 deletions(-)
 delete mode 100644 bench/better_unique.py

diff --git a/bench/better_unique.py b/bench/better_unique.py
deleted file mode 100644
index e03a4f433ce66..0000000000000
--- a/bench/better_unique.py
+++ /dev/null
@@ -1,80 +0,0 @@
-from __future__ import print_function
-from pandas import DataFrame
-from pandas.compat import range, zip
-import timeit
-
-setup = """
-from pandas import Series
-import pandas._tseries as _tseries
-from pandas.compat import range
-import random
-import numpy as np
-
-def better_unique(values):
-    uniques = _tseries.fast_unique(values)
-    id_map = _tseries.map_indices_buf(uniques)
-    labels = _tseries.get_unique_labels(values, id_map)
-    return uniques, labels
-
-tot = 100000
-
-def get_test_data(ngroups=100, n=tot):
-    unique_groups = range(ngroups)
-    random.shuffle(unique_groups)
-    arr = np.asarray(np.tile(unique_groups, n / ngroups), dtype=object)
-
-    if len(arr) < n:
-        arr = np.asarray(list(arr) + unique_groups[:n - len(arr)],
-                         dtype=object)
-
-    return arr
-
-arr = get_test_data(ngroups=%d)
-"""
-
-group_sizes = [10, 100, 1000, 10000,
-               20000, 30000, 40000,
-               50000, 60000, 70000,
-               80000, 90000, 100000]
-
-numbers = [100, 100, 50] + [10] * 10
-
-numpy = []
-wes = []
-
-for sz, n in zip(group_sizes, numbers):
-    # wes_timer =  timeit.Timer(stmt='better_unique(arr)',
-    #                           setup=setup % sz)
-    wes_timer = timeit.Timer(stmt='_tseries.fast_unique(arr)',
-                             setup=setup % sz)
-
-    numpy_timer = timeit.Timer(stmt='np.unique(arr)',
-                               setup=setup % sz)
-
-    print(n)
-    numpy_result = numpy_timer.timeit(number=n) / n
-    wes_result = wes_timer.timeit(number=n) / n
-
-    print('Groups: %d, NumPy: %s, Wes: %s' % (sz, numpy_result, wes_result))
-
-    wes.append(wes_result)
-    numpy.append(numpy_result)
-
-result = DataFrame({'wes': wes, 'numpy': numpy}, index=group_sizes)
-
-
-def make_plot(numpy, wes):
-    pass
-
-# def get_test_data(ngroups=100, n=100000):
-#     unique_groups = range(ngroups)
-#     random.shuffle(unique_groups)
-#     arr = np.asarray(np.tile(unique_groups, n / ngroups), dtype=object)
-
-#     if len(arr) < n:
-#         arr = np.asarray(list(arr) + unique_groups[:n - len(arr)],
-#                          dtype=object)
-
-#     return arr
-
-# arr = get_test_data(ngroups=1000)

From 74860eebc51e407f379a10fda85b8f73b80b182c Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Mon, 17 Jul 2017 12:56:45 +0200
Subject: [PATCH 24/37] Delete duplicated.R

---
 bench/duplicated.R | 22 ----------------------
 1 file changed, 22 deletions(-)
 delete mode 100644 bench/duplicated.R

diff --git a/bench/duplicated.R b/bench/duplicated.R
deleted file mode 100644
index eb2376df2932a..0000000000000
--- a/bench/duplicated.R
+++ /dev/null
@@ -1,22 +0,0 @@
-N <- 100000
-
-k1 = rep(NA, N)
-k2 = rep(NA, N)
-for (i in 1:N){
-  k1[i] <- paste(sample(letters, 1), collapse="")
-  k2[i] <- paste(sample(letters, 1), collapse="")
-}
-df <- data.frame(a=k1, b=k2, c=rep(1:100, N / 100))
-df2 <- data.frame(a=k1, b=k2)
-
-timings <- numeric()
-timings2 <- numeric()
-for (i in 1:50) {
-  gc()
-  timings[i] = system.time(deduped <- df[!duplicated(df),])[3]
-  gc()
-  timings2[i] = system.time(deduped <- df[!duplicated(df[,c("a", "b")]),])[3]
-}
-
-mean(timings)
-mean(timings2)

From 5702c5ecd4a0a9b64cef1c32e52586e6a339d9c9 Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Mon, 17 Jul 2017 12:56:56 +0200
Subject: [PATCH 25/37] Delete io_roundtrip.py

---
 bench/io_roundtrip.py | 116 ------------------------------------------
 1 file changed, 116 deletions(-)
 delete mode 100644 bench/io_roundtrip.py

diff --git a/bench/io_roundtrip.py b/bench/io_roundtrip.py
deleted file mode 100644
index d87da0ec6321a..0000000000000
--- a/bench/io_roundtrip.py
+++ /dev/null
@@ -1,116 +0,0 @@
-from __future__ import print_function
-import time
-import os
-import numpy as np
-
-import la
-import pandas
-from pandas.compat import range
-from pandas import datetools, DatetimeIndex
-
-
-def timeit(f, iterations):
-    start = time.clock()
-
-    for i in range(iterations):
-        f()
-
-    return time.clock() - start
-
-
-def rountrip_archive(N, K=50, iterations=10):
-    # Create data
-    arr = np.random.randn(N, K)
-    # lar = la.larry(arr)
-    dma = pandas.DataFrame(arr,
-                           DatetimeIndex('1/1/2000', periods=N,
-                                     offset=datetools.Minute()))
-    dma[201] = 'bar'
-
-    # filenames
-    filename_numpy = '/Users/wesm/tmp/numpy.npz'
-    filename_larry = '/Users/wesm/tmp/archive.hdf5'
-    filename_pandas = '/Users/wesm/tmp/pandas_tmp'
-
-    # Delete old files
-    try:
-        os.unlink(filename_numpy)
-    except:
-        pass
-    try:
-        os.unlink(filename_larry)
-    except:
-        pass
-
-    try:
-        os.unlink(filename_pandas)
-    except:
-        pass
-
-    # Time a round trip save and load
-    # numpy_f = lambda: numpy_roundtrip(filename_numpy, arr, arr)
-    # numpy_time = timeit(numpy_f, iterations) / iterations
-
-    # larry_f = lambda: larry_roundtrip(filename_larry, lar, lar)
-    # larry_time = timeit(larry_f, iterations) / iterations
-
-    pandas_f = lambda: pandas_roundtrip(filename_pandas, dma, dma)
-    pandas_time = timeit(pandas_f, iterations) / iterations
-    print('pandas (HDF5) %7.4f seconds' % pandas_time)
-
-    pickle_f = lambda: pandas_roundtrip(filename_pandas, dma, dma)
-    pickle_time = timeit(pickle_f, iterations) / iterations
-    print('pandas (pickle) %7.4f seconds' % pickle_time)
-
-    # print('Numpy (npz)   %7.4f seconds' % numpy_time)
-    # print('larry (HDF5)  %7.4f seconds' % larry_time)
-
-    # Delete old files
-    try:
-        os.unlink(filename_numpy)
-    except:
-        pass
-    try:
-        os.unlink(filename_larry)
-    except:
-        pass
-
-    try:
-        os.unlink(filename_pandas)
-    except:
-        pass
-
-
-def numpy_roundtrip(filename, arr1, arr2):
-    np.savez(filename, arr1=arr1, arr2=arr2)
-    npz = np.load(filename)
-    arr1 = npz['arr1']
-    arr2 = npz['arr2']
-
-
-def larry_roundtrip(filename, lar1, lar2):
-    io = la.IO(filename)
-    io['lar1'] = lar1
-    io['lar2'] = lar2
-    lar1 = io['lar1']
-    lar2 = io['lar2']
-
-
-def pandas_roundtrip(filename, dma1, dma2):
-    # What's the best way to code this?
-    from pandas.io.pytables import HDFStore
-    store = HDFStore(filename)
-    store['dma1'] = dma1
-    store['dma2'] = dma2
-    dma1 = store['dma1']
-    dma2 = store['dma2']
-
-
-def pandas_roundtrip_pickle(filename, dma1, dma2):
-    dma1.save(filename)
-    dma1 = pandas.DataFrame.load(filename)
-    dma2.save(filename)
-    dma2 = pandas.DataFrame.load(filename)
-
-if __name__ == '__main__':
-    rountrip_archive(10000, K=200)

From 6d798c1c7dd667e87d04d27d07b61c01f3c16846 Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Mon, 17 Jul 2017 12:57:18 +0200
Subject: [PATCH 26/37] Delete larry.py

---
 bench/larry.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 bench/larry.py

diff --git a/bench/larry.py b/bench/larry.py
deleted file mode 100644
index e69de29bb2d1d..0000000000000

From 2e2af72ab22d12747227bad5ed78bb88ce8b0926 Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Mon, 17 Jul 2017 13:35:36 +0200
Subject: [PATCH 27/37] Delete serialize.py

---
 bench/serialize.py | 89 ----------------------------------------------
 1 file changed, 89 deletions(-)
 delete mode 100644 bench/serialize.py

diff --git a/bench/serialize.py b/bench/serialize.py
deleted file mode 100644
index b0edd6a5752d2..0000000000000
--- a/bench/serialize.py
+++ /dev/null
@@ -1,89 +0,0 @@
-from __future__ import print_function
-from pandas.compat import range, lrange
-import time
-import os
-import numpy as np
-
-import la
-import pandas
-
-
-def timeit(f, iterations):
-    start = time.clock()
-
-    for i in range(iterations):
-        f()
-
-    return time.clock() - start
-
-
-def roundtrip_archive(N, iterations=10):
-
-    # Create data
-    arr = np.random.randn(N, N)
-    lar = la.larry(arr)
-    dma = pandas.DataFrame(arr, lrange(N), lrange(N))
-
-    # filenames
-    filename_numpy = '/Users/wesm/tmp/numpy.npz'
-    filename_larry = '/Users/wesm/tmp/archive.hdf5'
-    filename_pandas = '/Users/wesm/tmp/pandas_tmp'
-
-    # Delete old files
-    try:
-        os.unlink(filename_numpy)
-    except:
-        pass
-    try:
-        os.unlink(filename_larry)
-    except:
-        pass
-    try:
-        os.unlink(filename_pandas)
-    except:
-        pass
-
-    # Time a round trip save and load
-    numpy_f = lambda: numpy_roundtrip(filename_numpy, arr, arr)
-    numpy_time = timeit(numpy_f, iterations) / iterations
-
-    larry_f = lambda: larry_roundtrip(filename_larry, lar, lar)
-    larry_time = timeit(larry_f, iterations) / iterations
-
-    pandas_f = lambda: pandas_roundtrip(filename_pandas, dma, dma)
-    pandas_time = timeit(pandas_f, iterations) / iterations
-
-    print('Numpy (npz)   %7.4f seconds' % numpy_time)
-    print('larry (HDF5)  %7.4f seconds' % larry_time)
-    print('pandas (HDF5) %7.4f seconds' % pandas_time)
-
-
-def numpy_roundtrip(filename, arr1, arr2):
-    np.savez(filename, arr1=arr1, arr2=arr2)
-    npz = np.load(filename)
-    arr1 = npz['arr1']
-    arr2 = npz['arr2']
-
-
-def larry_roundtrip(filename, lar1, lar2):
-    io = la.IO(filename)
-    io['lar1'] = lar1
-    io['lar2'] = lar2
-    lar1 = io['lar1']
-    lar2 = io['lar2']
-
-
-def pandas_roundtrip(filename, dma1, dma2):
-    from pandas.io.pytables import HDFStore
-    store = HDFStore(filename)
-    store['dma1'] = dma1
-    store['dma2'] = dma2
-    dma1 = store['dma1']
-    dma2 = store['dma2']
-
-
-def pandas_roundtrip_pickle(filename, dma1, dma2):
-    dma1.save(filename)
-    dma1 = pandas.DataFrame.load(filename)
-    dma2.save(filename)
-    dma2 = pandas.DataFrame.load(filename)

From 29cddf2f5bf0b8df1bc92709d5bf5cede7a43d0a Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Mon, 17 Jul 2017 13:36:02 +0200
Subject: [PATCH 28/37] Delete test.py

---
 bench/test.py | 70 ---------------------------------------------------
 1 file changed, 70 deletions(-)
 delete mode 100644 bench/test.py

diff --git a/bench/test.py b/bench/test.py
deleted file mode 100644
index 2339deab313a1..0000000000000
--- a/bench/test.py
+++ /dev/null
@@ -1,70 +0,0 @@
-import numpy as np
-import itertools
-import collections
-import scipy.ndimage as ndi
-from pandas.compat import zip, range
-
-N = 10000
-
-lat = np.random.randint(0, 360, N)
-lon = np.random.randint(0, 360, N)
-data = np.random.randn(N)
-
-
-def groupby1(lat, lon, data):
-    indexer = np.lexsort((lon, lat))
-    lat = lat.take(indexer)
-    lon = lon.take(indexer)
-    sorted_data = data.take(indexer)
-
-    keys = 1000. * lat + lon
-    unique_keys = np.unique(keys)
-    bounds = keys.searchsorted(unique_keys)
-
-    result = group_agg(sorted_data, bounds, lambda x: x.mean())
-
-    decoder = keys.searchsorted(unique_keys)
-
-    return dict(zip(zip(lat.take(decoder), lon.take(decoder)), result))
-
-
-def group_mean(lat, lon, data):
-    indexer = np.lexsort((lon, lat))
-    lat = lat.take(indexer)
-    lon = lon.take(indexer)
-    sorted_data = data.take(indexer)
-
-    keys = 1000 * lat + lon
-    unique_keys = np.unique(keys)
-
-    result = ndi.mean(sorted_data, labels=keys, index=unique_keys)
-    decoder = keys.searchsorted(unique_keys)
-
-    return dict(zip(zip(lat.take(decoder), lon.take(decoder)), result))
-
-
-def group_mean_naive(lat, lon, data):
-    grouped = collections.defaultdict(list)
-    for lt, ln, da in zip(lat, lon, data):
-        grouped[(lt, ln)].append(da)
-
-    averaged = dict((ltln, np.mean(da)) for ltln, da in grouped.items())
-
-    return averaged
-
-
-def group_agg(values, bounds, f):
-    N = len(values)
-    result = np.empty(len(bounds), dtype=float)
-    for i, left_bound in enumerate(bounds):
-        if i == len(bounds) - 1:
-            right_bound = N
-        else:
-            right_bound = bounds[i + 1]
-
-        result[i] = f(values[left_bound: right_bound])
-
-    return result
-
-# for i in range(10):
-#     groupby1(lat, lon, data)

From a6e8445271cf9cdcb4312f7d70f6618d79f2adff Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Mon, 17 Jul 2017 13:36:24 +0200
Subject: [PATCH 29/37] Delete zoo_bench.R

---
 bench/zoo_bench.R | 71 -----------------------------------------------
 1 file changed, 71 deletions(-)
 delete mode 100644 bench/zoo_bench.R

diff --git a/bench/zoo_bench.R b/bench/zoo_bench.R
deleted file mode 100644
index 294d55f51a9ab..0000000000000
--- a/bench/zoo_bench.R
+++ /dev/null
@@ -1,71 +0,0 @@
-library(zoo)
-library(xts)
-library(fts)
-library(tseries)
-library(its)
-library(xtable)
-
-## indices = rep(NA, 100000)
-## for (i in 1:100000)
-##   indices[i] <- paste(sample(letters, 10), collapse="")
-
-
-
-## x <- zoo(rnorm(100000), indices)
-## y <- zoo(rnorm(90000), indices[sample(1:100000, 90000)])
-
-## indices <- as.POSIXct(1:100000)
-
-indices <- as.POSIXct(Sys.Date()) + seq(1, 100000000, 100)
-
-sz <- 500000
-
-## x <- xts(rnorm(sz), sample(indices, sz))
-## y <- xts(rnorm(sz), sample(indices, sz))
-
-zoo.bench <- function(){
-    x <- zoo(rnorm(sz), sample(indices, sz))
-    y <- zoo(rnorm(sz), sample(indices, sz))
-    timeit(function() {x + y})
-}
-
-xts.bench <- function(){
-    x <- xts(rnorm(sz), sample(indices, sz))
-    y <- xts(rnorm(sz), sample(indices, sz))
-    timeit(function() {x + y})
-}
-
-fts.bench <- function(){
-    x <- fts(rnorm(sz), sort(sample(indices, sz)))
-    y <- fts(rnorm(sz), sort(sample(indices, sz))
-    timeit(function() {x + y})
-}
-
-its.bench <- function(){
-    x <- its(rnorm(sz), sort(sample(indices, sz)))
-    y <- its(rnorm(sz), sort(sample(indices, sz)))
-    timeit(function() {x + y})
-}
-
-irts.bench <- function(){
-    x <- irts(sort(sample(indices, sz)), rnorm(sz))
-    y <- irts(sort(sample(indices, sz)), rnorm(sz))
-    timeit(function() {x + y})
-}
-
-timeit <- function(f){
-  timings <- numeric()
-  for (i in 1:10) {
-    gc()
-    timings[i] = system.time(f())[3]
-  }
-  mean(timings)
-}
-
-bench <- function(){
-  results <- c(xts.bench(), fts.bench(), its.bench(), zoo.bench())
-  names <- c("xts", "fts", "its", "zoo")
-  data.frame(results, names)
-}
-
-result <- bench()

From 3e6662d3d8e15b6c923a680f167595d8ff65ddaa Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Mon, 17 Jul 2017 13:36:41 +0200
Subject: [PATCH 30/37] Delete zoo_bench.py

---
 bench/zoo_bench.py | 36 ------------------------------------
 1 file changed, 36 deletions(-)
 delete mode 100644 bench/zoo_bench.py

diff --git a/bench/zoo_bench.py b/bench/zoo_bench.py
deleted file mode 100644
index 74cb1952a5a2a..0000000000000
--- a/bench/zoo_bench.py
+++ /dev/null
@@ -1,36 +0,0 @@
-from pandas import *
-from pandas.util.testing import rands
-
-n = 1000000
-# indices = Index([rands(10) for _ in xrange(n)])
-
-
-def sample(values, k):
-    sampler = np.random.permutation(len(values))
-    return values.take(sampler[:k])
-sz = 500000
-rng = np.arange(0, 10000000000000, 10000000)
-stamps = np.datetime64(datetime.now()).view('i8') + rng
-idx1 = np.sort(sample(stamps, sz))
-idx2 = np.sort(sample(stamps, sz))
-ts1 = Series(np.random.randn(sz), idx1)
-ts2 = Series(np.random.randn(sz), idx2)
-
-
-# subsample_size = 90000
-
-# x = Series(np.random.randn(100000), indices)
-# y = Series(np.random.randn(subsample_size),
-#            index=sample(indices, subsample_size))
-
-
-# lx = larry(np.random.randn(100000), [list(indices)])
-# ly = larry(np.random.randn(subsample_size), [list(y.index)])
-
-# Benchmark 1: Two 1-million length time series (int64-based index) with
-# randomly chosen timestamps
-
-# Benchmark 2: Join two 5-variate time series DataFrames (outer and inner join)
-
-# df1 = DataFrame(np.random.randn(1000000, 5), idx1, columns=range(5))
-# df2 = DataFrame(np.random.randn(1000000, 5), idx2, columns=range(5, 10))

From 8a3411a79bfb98592211908abc8ad8721d202ae1 Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Mon, 17 Jul 2017 13:40:13 +0200
Subject: [PATCH 31/37] Delete build_27-32.bat

---
 scripts/windows_builder/build_27-32.bat | 25 -------------------------
 1 file changed, 25 deletions(-)
 delete mode 100644 scripts/windows_builder/build_27-32.bat

diff --git a/scripts/windows_builder/build_27-32.bat b/scripts/windows_builder/build_27-32.bat
deleted file mode 100644
index 37eb4d436d567..0000000000000
--- a/scripts/windows_builder/build_27-32.bat
+++ /dev/null
@@ -1,25 +0,0 @@
-@echo off
-echo "starting 27-32"
-
-setlocal EnableDelayedExpansion
-set MSSdk=1
-CALL "C:\Program Files\Microsoft SDKs\Windows\v7.0\Bin\SetEnv.cmd" /x86 /release
-set DISTUTILS_USE_SDK=1
-
-title 27-32 build
-echo "building"
-cd "c:\users\Jeff Reback\documents\github\pandas"
-C:\python27-32\python.exe setup.py build > build.27-32.log 2>&1
-
-title "installing"
-C:\python27-32\python.exe setup.py bdist --formats=wininst > install.27-32.log 2>&1
-
-echo "testing"
-C:\python27-32\scripts\nosetests -A "not slow" build\lib.win32-2.7\pandas > test.27-32.log 2>&1
-
-echo "versions"
-cd build\lib.win32-2.7
-C:\python27-32\python.exe ../../ci/print_versions.py > ../../versions.27-32.log 2>&1
-
-exit
-

From 0e8b68f64b4a309dde38ea516b752bd284f1a901 Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Mon, 17 Jul 2017 13:40:29 +0200
Subject: [PATCH 32/37] Delete build_27-64.bat

---
 scripts/windows_builder/build_27-64.bat | 25 -------------------------
 1 file changed, 25 deletions(-)
 delete mode 100644 scripts/windows_builder/build_27-64.bat

diff --git a/scripts/windows_builder/build_27-64.bat b/scripts/windows_builder/build_27-64.bat
deleted file mode 100644
index e76e25d0ef39c..0000000000000
--- a/scripts/windows_builder/build_27-64.bat
+++ /dev/null
@@ -1,25 +0,0 @@
-@echo off
-echo "starting 27-64"
-
-setlocal EnableDelayedExpansion
-set MSSdk=1
-CALL "C:\Program Files\Microsoft SDKs\Windows\v7.0\Bin\SetEnv.cmd" /x64 /release
-set DISTUTILS_USE_SDK=1
-
-title 27-64 build
-echo "building"
-cd "c:\users\Jeff Reback\documents\github\pandas"
-C:\python27-64\python.exe setup.py build > build.27-64.log 2>&1
-
-echo "installing"
-C:\python27-64\python.exe setup.py bdist --formats=wininst > install.27-64.log 2>&1
-
-echo "testing"
-C:\python27-64\scripts\nosetests -A "not slow" build\lib.win-amd64-2.7\pandas > test.27-64.log 2>&1
-
-echo "versions"
-cd build\lib.win-amd64-2.7
-C:\python27-64\python.exe ../../ci/print_versions.py > ../../versions.27-64.log 2>&1
-
-exit
-

From 9fbadcea6552738f0cc9ec10fe0670d181b206fb Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Mon, 17 Jul 2017 13:40:43 +0200
Subject: [PATCH 33/37] Delete build_34-32.bat

---
 scripts/windows_builder/build_34-32.bat | 27 -------------------------
 1 file changed, 27 deletions(-)
 delete mode 100644 scripts/windows_builder/build_34-32.bat

diff --git a/scripts/windows_builder/build_34-32.bat b/scripts/windows_builder/build_34-32.bat
deleted file mode 100644
index 8e060e000bc8f..0000000000000
--- a/scripts/windows_builder/build_34-32.bat
+++ /dev/null
@@ -1,27 +0,0 @@
-@echo off
-echo "starting 34-32"
-
-setlocal EnableDelayedExpansion
-set MSSdk=1
-CALL "C:\Program Files\Microsoft SDKs\Windows\v7.1\Bin\SetEnv.cmd" /x86 /release
-set DISTUTILS_USE_SDK=1
-
-title 34-32 build
-echo "building"
-cd "c:\users\Jeff Reback\documents\github\pandas"
-C:\python34-32\python.exe setup.py build > build.34-32.log 2>&1
-
-echo "installing"
-C:\python34-32\python.exe setup.py bdist --formats=wininst > install.34-32.log 2>&1
-
-echo "testing"
-C:\python34-32\scripts\nosetests -A "not slow" build\lib.win32-3.4\pandas > test.34-32.log 2>&1
-
-echo "versions"
-cd build\lib.win32-3.4
-C:\python34-32\python.exe ../../ci/print_versions.py > ../../versions.34-32.log 2>&1
-
-exit
-
-
-

From 791c76f361b16505627a9fe751df8a8a6ec0d7cd Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Mon, 17 Jul 2017 13:40:58 +0200
Subject: [PATCH 34/37] Delete build_34-64.bat

---
 scripts/windows_builder/build_34-64.bat | 27 -------------------------
 1 file changed, 27 deletions(-)
 delete mode 100644 scripts/windows_builder/build_34-64.bat

diff --git a/scripts/windows_builder/build_34-64.bat b/scripts/windows_builder/build_34-64.bat
deleted file mode 100644
index 3a8512b730346..0000000000000
--- a/scripts/windows_builder/build_34-64.bat
+++ /dev/null
@@ -1,27 +0,0 @@
-@echo off
-echo "starting 34-64"
-
-setlocal EnableDelayedExpansion
-set MSSdk=1
-CALL "C:\Program Files\Microsoft SDKs\Windows\v7.1\Bin\SetEnv.cmd" /x64 /release
-set DISTUTILS_USE_SDK=1
-
-title 34-64 build
-echo "building"
-cd "c:\users\Jeff Reback\documents\github\pandas"
-C:\python34-64\python.exe setup.py build > build.34-64.log 2>&1
-
-echo "installing"
-C:\python34-64\python.exe setup.py bdist --formats=wininst > install.34-64.log 2>&1
-
-echo "testing"
-C:\python34-64\scripts\nosetests -A "not slow" build\lib.win-amd64-3.4\pandas > test.34-64.log 2>&1
-
-echo "versions"
-cd build\lib.win-amd64-3.4
-C:\python34-64\python.exe ../../ci/print_versions.py > ../../versions.34-64.log 2>&1
-
-exit
-
-
-

From 914196115aa93b546cb2b412059232191f04c481 Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Mon, 17 Jul 2017 13:41:11 +0200
Subject: [PATCH 35/37] Delete check_and_build.bat

---
 scripts/windows_builder/check_and_build.bat | 2 --
 1 file changed, 2 deletions(-)
 delete mode 100644 scripts/windows_builder/check_and_build.bat

diff --git a/scripts/windows_builder/check_and_build.bat b/scripts/windows_builder/check_and_build.bat
deleted file mode 100644
index 32be1bde1f7f3..0000000000000
--- a/scripts/windows_builder/check_and_build.bat
+++ /dev/null
@@ -1,2 +0,0 @@
-set PYTHONPATH=c:/python27-64/lib
-c:/python27-64/python.exe c:/Builds/check_and_build.py %1 %2 %3 %4 %4 %6 %7 %8 %9

From 03cf289b215fb8fbbac31e6ae6fefd0082c7a82a Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Mon, 17 Jul 2017 13:41:27 +0200
Subject: [PATCH 36/37] Delete check_and_build.py

---
 scripts/windows_builder/check_and_build.py | 194 ---------------------
 1 file changed, 194 deletions(-)
 delete mode 100644 scripts/windows_builder/check_and_build.py

diff --git a/scripts/windows_builder/check_and_build.py b/scripts/windows_builder/check_and_build.py
deleted file mode 100644
index 4a555e5c16b09..0000000000000
--- a/scripts/windows_builder/check_and_build.py
+++ /dev/null
@@ -1,194 +0,0 @@
-import datetime
-import git
-import logging
-import os, re, time
-import subprocess
-import argparse
-import pysftp
-
-# parse the args
-parser = argparse.ArgumentParser(description='build, test, and install updated versions of master pandas')
-parser.add_argument('-b', '--build',
-                    help='run just this build',
-                    dest='build')
-parser.add_argument('-u', '--update',
-                    help='get a git update',
-                    dest='update',
-                    action='store_true',
-                    default=False)
-parser.add_argument('-t', '--test',
-                    help='run the tests',
-                    dest='test',
-                    action='store_true',
-                    default=False)
-parser.add_argument('-c', '--compare',
-                    help='show the last tests compare',
-                    dest='compare',
-                    action='store_true',
-                    default=False)
-parser.add_argument('-v', '--version',
-                    help='show the last versions',
-                    dest='version',
-                    action='store_true',
-                    default=False)
-parser.add_argument('-i', '--install',
-                    help='run the install',
-                    dest='install',
-                    action='store_true',
-                    default=False)
-parser.add_argument('--dry',
-                    help='dry run',
-                    dest='dry',
-                    action='store_true',
-                    default=False)
-
-args = parser.parse_args()
-dry_run = args.dry
-
-builds = ['27-32','27-64','34-32','34-64']
-base_dir = "C:\Users\Jeff Reback\Documents\GitHub\pandas"
-remote_host='pandas.pydata.org'
-username='pandas'
-password = '############'
-
-# drop python from our environment to avoid
-# passing this onto sub-processes
-env = os.environ
-del env['PYTHONPATH']
-
-# the stdout logger
-fmt = '%(asctime)s: %(message)s'
-logger = logging.getLogger('check_and_build')
-logger.setLevel(logging.DEBUG)
-stream_handler = logging.StreamHandler()
-stream_handler.setFormatter(logging.Formatter(fmt))
-logger.addHandler(stream_handler)
-
-def run_all(test=False,compare=False,install=False,version=False,build=None):
-    # run everything
-
-    for b in builds:
-        if build is not None and build != b:
-            continue
-        if test:
-            do_rebuild(b)
-        if compare or test:
-            try:
-                do_compare(b)
-            except (Exception) as e:
-                logger.info("ERROR COMPARE {0} : {1}".format(b,e))
-        if version:
-            try:
-                do_version(b)
-            except (Exception) as e:
-                logger.info("ERROR VERSION {0} : {1}".format(b,e))
-
-    if install:
-        run_install()
-
-def do_rebuild(build):
-    # trigger the rebuild
-
-    cmd = "c:/Builds/build_{0}.bat".format(build)
-    logger.info("rebuild : {0}".format(cmd))
-    p = subprocess.Popen("start /wait /min {0}".format(cmd),env=env,shell=True,close_fds=True)
-    ret = p.wait()
-
-def do_compare(build):
-    # print the test outputs
-
-    f = os.path.join(base_dir,"test.{0}.log".format(build))
-    with open(f,'r') as fh:
-        for l in fh:
-            l = l.rstrip()
-            if l.startswith('ERROR:'):
-                logger.info("{0} : {1}".format(build,l))
-            if l.startswith('Ran') or l.startswith('OK') or l.startswith('FAIL'):
-                logger.info("{0} : {1}".format(build,l))
-
-def do_version(build):
-    # print the version strings
-
-    f = os.path.join(base_dir,"versions.{0}.log".format(build))
-    with open(f,'r') as fh:
-        for l in fh:
-            l = l.rstrip()
-            logger.info("{0} : {1}".format(build,l))
-
-def do_update(is_verbose=True):
-    # update git; return True if the commit has changed
-
-    repo = git.Repo(base_dir)
-    master = repo.heads.master
-    origin = repo.remotes.origin
-    start_commit = master.commit
-
-    if is_verbose:
-        logger.info("current commit   : {0}".format(start_commit))
-
-    try:
-        origin.update()
-    except (Exception) as e:
-        logger.info("update exception : {0}".format(e))
-    try:
-        origin.pull()
-    except (Exception) as e:
-        logger.info("pull exception : {0}".format(e))
-
-    result = start_commit != master.commit
-    if result:
-        if is_verbose:
-            logger.info("commits changed : {0} -> {1}".format(start_commit,master.commit))
-    return result
-
-def run_install():
-    # send the installation binaries
-
-    repo = git.Repo(base_dir)
-    master = repo.heads.master
-    commit = master.commit
-    short_hash = str(commit)[:7]
-
-    logger.info("sending files : {0}".format(commit))
-    d = os.path.join(base_dir,"dist")
-    files = [ f for f in os.listdir(d) if re.search(short_hash,f) ]
-    srv = pysftp.Connection(host=remote_host,username=username,password=password)
-    srv.chdir("www/pandas-build/dev")
-
-    # get current files
-    remote_files = set(srv.listdir(path='.'))
-
-    for f in files:
-        if f not in remote_files:
-            logger.info("sending: {0}".format(f))
-            local = os.path.join(d,f)
-            srv.put(localpath=local)
-
-    srv.close()
-    logger.info("sending files: done")
-
-# just perform the action
-if args.update or args.test or args.compare or args.install or args.version:
-    if args.update:
-        do_update()
-    run_all(test=args.test,compare=args.compare,install=args.install,version=args.version,build=args.build)
-    exit(0)
-
-# file logging
-file_handler = logging.FileHandler("C:\Builds\logs\check_and_build.log")
-file_handler.setFormatter(logging.Formatter(fmt))
-logger.addHandler(file_handler)
-
-logger.info("start")
-
-# main loop
-while(True):
-
-    if do_update():
-        run_all(test=True,install=False)
-
-    time.sleep(60*60)
-
-logger.info("exit")
-file_handler.close()
-

From a3efe34ccfbc65af5a275ab580e9e1661e37f437 Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Mon, 17 Jul 2017 13:41:42 +0200
Subject: [PATCH 37/37] Delete readme.txt

---
 scripts/windows_builder/readme.txt | 17 -----------------
 1 file changed, 17 deletions(-)
 delete mode 100644 scripts/windows_builder/readme.txt

diff --git a/scripts/windows_builder/readme.txt b/scripts/windows_builder/readme.txt
deleted file mode 100644
index 789e2a9ee0c63..0000000000000
--- a/scripts/windows_builder/readme.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-This is a collection of windows batch scripts (and a python script)
-to rebuild the binaries, test, and upload the binaries for public distribution
-upon a commit on github.
-
-Obviously requires that these be setup on windows
-Requires an install of Windows SDK 3.5 and 4.0
-Full python installs for each version with the deps
-
-Currently supporting
-
-27-32,27-64,34-32,34-64
-
-Note that 34 use the 4.0 SDK, while the other suse 3.5 SDK
-
-I installed these scripts in C:\Builds
-
-Installed libaries in C:\Installs