From 812f9288ebf00155a39ef25751fabaaccfa5cc68 Mon Sep 17 00:00:00 2001
From: "Praggastis, Brenda" <Brenda.Praggastis@pnnl.gov>
Date: Mon, 22 May 2017 22:29:49 -0700
Subject: [PATCH 1/3] gh-14671 Check that string-typed usecols is a subset of
 names; raise an error if it is not

---
 pandas/io/parsers.py              |  6 ++++++
 pandas/tests/io/parser/usecols.py | 18 ++++++++++++++++++
 2 files changed, 24 insertions(+)

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index aab70c8ce2cd4..5f73a2e589c8a 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -1649,6 +1649,12 @@ def __init__(self, src, **kwds):
 
         if self.usecols:
             usecols = _evaluate_usecols(self.usecols, self.orig_names)
+
+            #gh-14671
+            if  (self.usecols_dtype == 'string') and \
+                (not set(usecols).issubset(self.orig_names)):
+               raise ValueError("Usecols do not match names.")
+
             if len(self.names) > len(usecols):
                 self.names = [n for i, n in enumerate(self.names)
                               if (i in usecols or n in usecols)]
diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py
index 8761d1ccd3da4..85fdf802d60b7 100644
--- a/pandas/tests/io/parser/usecols.py
+++ b/pandas/tests/io/parser/usecols.py
@@ -9,6 +9,7 @@
 
 import numpy as np
 import pandas.util.testing as tm
+import re
 
 from pandas import DataFrame, Index
 from pandas._libs.lib import Timestamp
@@ -475,3 +476,20 @@ def test_uneven_length_cols(self):
                               'C': [3, 5, 4, 3, 3, 7]})
         df = self.read_csv(StringIO(data), usecols=usecols)
         tm.assert_frame_equal(df, expected)
+
+    def test_raise_on_usecols_names_mismatch(self):
+        # see gh-14671
+        data = 'a,b,c,d\n1,2,3,4\n5,6,7,8'
+        usecols = ['a','b','c','d']
+        df = self.read_csv(StringIO(data), usecols=usecols)
+        expected = DataFrame({'a': [1,5], 'b': [2,6], 'c': [3,7], 'd': [4,8]})
+        tm.assert_frame_equal(df, expected)
+
+        msg = 'Usecols do not match names'  ## from parsers.py CParserWrapper()
+        msg2 = 'is not in list' ## from parser.py _handle_usecols()
+        usecols = ['a','b','c','f']
+        with tm.assert_raises_regex(ValueError, re.compile("'" + msg + '||' + msg2 + "'")):
+            self.read_csv(StringIO(data), usecols=usecols)
+        usecols = ['a','b','f']
+        with tm.assert_raises_regex(ValueError, re.compile("'" + msg + '||' + msg2 + "'")):
+            self.read_csv(StringIO(data), usecols=usecols)

From 1968a70a4b821b0075d87c3b37e273ec876d84bf Mon Sep 17 00:00:00 2001
From: "Praggastis, Brenda" <Brenda.Praggastis@pnnl.gov>
Date: Tue, 23 May 2017 10:13:18 -0700
Subject: [PATCH 2/3] Add tests for gh-14671; the expected behavior of using
 usecols and names together is unclear, so those tests are commented
 out

---
 pandas/tests/io/parser/usecols.py | 32 ++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py
index 85fdf802d60b7..44c3c4dcffccd 100644
--- a/pandas/tests/io/parser/usecols.py
+++ b/pandas/tests/io/parser/usecols.py
@@ -478,18 +478,44 @@ def test_uneven_length_cols(self):
         tm.assert_frame_equal(df, expected)
 
     def test_raise_on_usecols_names_mismatch(self):
-        # see gh-14671
+        ## see gh-14671
         data = 'a,b,c,d\n1,2,3,4\n5,6,7,8'
+        msg = 'Usecols do not match names'  ## from parsers.py CParserWrapper()
+        msg2 = 'is not in list' ## from parser.py _handle_usecols()
+
         usecols = ['a','b','c','d']
         df = self.read_csv(StringIO(data), usecols=usecols)
         expected = DataFrame({'a': [1,5], 'b': [2,6], 'c': [3,7], 'd': [4,8]})
         tm.assert_frame_equal(df, expected)
 
-        msg = 'Usecols do not match names'  ## from parsers.py CParserWrapper()
-        msg2 = 'is not in list' ## from parser.py _handle_usecols()
         usecols = ['a','b','c','f']
         with tm.assert_raises_regex(ValueError, re.compile("'" + msg + '||' + msg2 + "'")):
             self.read_csv(StringIO(data), usecols=usecols)
+
         usecols = ['a','b','f']
         with tm.assert_raises_regex(ValueError, re.compile("'" + msg + '||' + msg2 + "'")):
             self.read_csv(StringIO(data), usecols=usecols)
+
+        names = ['A', 'B', 'C', 'D']
+
+        df = self.read_csv(StringIO(data), header=0, names=names)
+        expected = DataFrame({'A': [1,5], 'B': [2,6], 'C': [3,7], 'D': [4,8]})
+        tm.assert_frame_equal(df, expected)
+
+        # usecols = ['A','C']
+        # df = self.read_csv(StringIO(data), header=0, names=names, usecols=usecols)
+        # expected = DataFrame({'A': [1,5], 'C': [3,7]})
+        # tm.assert_frame_equal(df, expected)
+        #
+        # usecols = [0,2]
+        # df = self.read_csv(StringIO(data), header=0, names=names, usecols=usecols)
+        # expected = DataFrame({'A': [1,5], 'C': [3,7]})
+        # tm.assert_frame_equal(df, expected)
+
+
+        usecols = ['A','B','C','f']
+        with tm.assert_raises_regex(ValueError, re.compile("'" + msg + '||' + msg2 + "'")):
+            self.read_csv(StringIO(data), header=0, names=names, usecols=usecols)
+        usecols = ['A','B','f']
+        with tm.assert_raises_regex(ValueError, re.compile("'" + msg + '||' + msg2 + "'")):
+            self.read_csv(StringIO(data), names=names, usecols=usecols)

From 3418bdeb535aa82ac36dada5302ae3fb845b570d Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Sat, 3 Jun 2017 20:14:43 -0500
Subject: [PATCH 3/3] Review comments

---
 doc/source/whatsnew/v0.20.2.txt   |  1 +
 pandas/io/parsers.py              |  8 +++---
 pandas/tests/io/parser/usecols.py | 45 ++++++++++++++++++-------------
 3 files changed, 31 insertions(+), 23 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt
index c9486954258c8..ea92c45b7e35b 100644
--- a/doc/source/whatsnew/v0.20.2.txt
+++ b/doc/source/whatsnew/v0.20.2.txt
@@ -72,6 +72,7 @@ I/O
 ^^^
 
 - Bug in :func:`read_csv` when ``comment`` is passed in a space delimited text file (:issue:`16472`)
+- Bug in :func:`read_csv` not raising an exception with nonexistent columns in ``usecols`` when it had the correct length (:issue:`14671`)
 - Bug that would force importing of the clipboard routines unnecessarily, potentially causing an import error on startup (:issue:`16288`)
 - Bug that raised ``IndexError`` when HTML-rendering an empty ``DataFrame`` (:issue:`15953`)
 - Bug in :func:`read_csv` in which tarfile object inputs were raising an error in Python 2.x for the C engine (:issue:`16530`)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 5f73a2e589c8a..055d6d045d2f2 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -1650,10 +1650,10 @@ def __init__(self, src, **kwds):
         if self.usecols:
             usecols = _evaluate_usecols(self.usecols, self.orig_names)
 
-            #gh-14671
-            if  (self.usecols_dtype == 'string') and \
-                (not set(usecols).issubset(self.orig_names)):
-               raise ValueError("Usecols do not match names.")
+            # GH 14671
+            if (self.usecols_dtype == 'string' and
+                    not set(usecols).issubset(self.orig_names)):
+                raise ValueError("Usecols do not match names.")
 
             if len(self.names) > len(usecols):
                 self.names = [n for i, n in enumerate(self.names)
diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py
index 44c3c4dcffccd..f582e5037ca07 100644
--- a/pandas/tests/io/parser/usecols.py
+++ b/pandas/tests/io/parser/usecols.py
@@ -9,7 +9,6 @@
 
 import numpy as np
 import pandas.util.testing as tm
-import re
 
 from pandas import DataFrame, Index
 from pandas._libs.lib import Timestamp
@@ -478,44 +477,52 @@ def test_uneven_length_cols(self):
         tm.assert_frame_equal(df, expected)
 
     def test_raise_on_usecols_names_mismatch(self):
-        ## see gh-14671
+        # GH 14671
         data = 'a,b,c,d\n1,2,3,4\n5,6,7,8'
-        msg = 'Usecols do not match names'  ## from parsers.py CParserWrapper()
-        msg2 = 'is not in list' ## from parser.py _handle_usecols()
 
-        usecols = ['a','b','c','d']
+        if self.engine == 'c':
+            msg = 'Usecols do not match names'
+        else:
+            msg = 'is not in list'
+
+        usecols = ['a', 'b', 'c', 'd']
         df = self.read_csv(StringIO(data), usecols=usecols)
-        expected = DataFrame({'a': [1,5], 'b': [2,6], 'c': [3,7], 'd': [4,8]})
+        expected = DataFrame({'a': [1, 5], 'b': [2, 6], 'c': [3, 7],
+                              'd': [4, 8]})
         tm.assert_frame_equal(df, expected)
 
-        usecols = ['a','b','c','f']
-        with tm.assert_raises_regex(ValueError, re.compile("'" + msg + '||' + msg2 + "'")):
+        usecols = ['a', 'b', 'c', 'f']
+        with tm.assert_raises_regex(ValueError, msg):
             self.read_csv(StringIO(data), usecols=usecols)
 
-        usecols = ['a','b','f']
-        with tm.assert_raises_regex(ValueError, re.compile("'" + msg + '||' + msg2 + "'")):
+        usecols = ['a', 'b', 'f']
+        with tm.assert_raises_regex(ValueError, msg):
             self.read_csv(StringIO(data), usecols=usecols)
 
         names = ['A', 'B', 'C', 'D']
 
         df = self.read_csv(StringIO(data), header=0, names=names)
-        expected = DataFrame({'A': [1,5], 'B': [2,6], 'C': [3,7], 'D': [4,8]})
+        expected = DataFrame({'A': [1, 5], 'B': [2, 6], 'C': [3, 7],
+                              'D': [4, 8]})
         tm.assert_frame_equal(df, expected)
 
+        # TODO: https://github.com/pandas-dev/pandas/issues/16469
         # usecols = ['A','C']
-        # df = self.read_csv(StringIO(data), header=0, names=names, usecols=usecols)
+        # df = self.read_csv(StringIO(data), header=0, names=names,
+        #                    usecols=usecols)
         # expected = DataFrame({'A': [1,5], 'C': [3,7]})
         # tm.assert_frame_equal(df, expected)
         #
         # usecols = [0,2]
-        # df = self.read_csv(StringIO(data), header=0, names=names, usecols=usecols)
+        # df = self.read_csv(StringIO(data), header=0, names=names,
+        #                    usecols=usecols)
         # expected = DataFrame({'A': [1,5], 'C': [3,7]})
         # tm.assert_frame_equal(df, expected)
 
-
-        usecols = ['A','B','C','f']
-        with tm.assert_raises_regex(ValueError, re.compile("'" + msg + '||' + msg2 + "'")):
-            self.read_csv(StringIO(data), header=0, names=names, usecols=usecols)
-        usecols = ['A','B','f']
-        with tm.assert_raises_regex(ValueError, re.compile("'" + msg + '||' + msg2 + "'")):
+        usecols = ['A', 'B', 'C', 'f']
+        with tm.assert_raises_regex(ValueError, msg):
+            self.read_csv(StringIO(data), header=0, names=names,
+                          usecols=usecols)
+        usecols = ['A', 'B', 'f']
+        with tm.assert_raises_regex(ValueError, msg):
             self.read_csv(StringIO(data), names=names, usecols=usecols)