bitbang_spi: Add half-duplex optimizations

Currently, the core of bitbang_spi is a full-duplex SPI loop, but in
practice this code is only ever used half-duplex. Splitting this code
into two half-duplex loops allows us to optimize performance by reducing
communications and/or CPU pipeline stalls.

The speedup varies depending on how much the overhead of
getting/setting pins dominates execution time. For a USB bit-bang driver
running on a 7th generation Core i5, the time to probe drops from ~7.7
seconds to ~6.7 seconds when this patch is applied.

Change-Id: I33b9f363716f651146c09113bda5fffe53b16738
Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>
Reviewed-on: https://review.coreboot.org/26947
Tested-by: build bot (Jenkins) <no-reply@coreboot.org>
Reviewed-by: Nico Huber <nico.h@gmx.de>
diff --git a/bitbang_spi.c b/bitbang_spi.c
index 2c7a3f1..4b39164 100644
--- a/bitbang_spi.c
+++ b/bitbang_spi.c
@@ -119,14 +119,16 @@
 	return 0;
 }
 
-static uint8_t bitbang_spi_rw_byte(const struct bitbang_spi_master *master,
-				   uint8_t val)
+static uint8_t bitbang_spi_read_byte(const struct bitbang_spi_master *master)
 {
 	uint8_t ret = 0;
 	int i;
 
 	for (i = 7; i >= 0; i--) {
-		bitbang_spi_set_sck_set_mosi(master, 0, (val >> i) & 1);
+		if (i == 0)
+			bitbang_spi_set_sck_set_mosi(master, 0, 0);
+		else
+			bitbang_spi_set_sck(master, 0);
 		programmer_delay(master->half_period);
 		ret <<= 1;
 		ret |= bitbang_spi_set_sck_get_miso(master, 1);
@@ -135,6 +137,18 @@
 	return ret;
 }
 
+static void bitbang_spi_write_byte(const struct bitbang_spi_master *master, uint8_t val)
+{
+	int i;
+
+	for (i = 7; i >= 0; i--) {
+		bitbang_spi_set_sck_set_mosi(master, 0, (val >> i) & 1);
+		programmer_delay(master->half_period);
+		bitbang_spi_set_sck(master, 1);
+		programmer_delay(master->half_period);
+	}
+}
+
 static int bitbang_spi_send_command(struct flashctx *flash,
 				    unsigned int writecnt, unsigned int readcnt,
 				    const unsigned char *writearr,
@@ -150,9 +164,9 @@
 	bitbang_spi_request_bus(master);
 	bitbang_spi_set_cs(master, 0);
 	for (i = 0; i < writecnt; i++)
-		bitbang_spi_rw_byte(master, writearr[i]);
+		bitbang_spi_write_byte(master, writearr[i]);
 	for (i = 0; i < readcnt; i++)
-		readarr[i] = bitbang_spi_rw_byte(master, 0);
+		readarr[i] = bitbang_spi_read_byte(master);
 
 	bitbang_spi_set_sck(master, 0);
 	programmer_delay(master->half_period);