Splitting a large CSV file into multiple smaller files

Just this past weekend I needed to migrate a large amount of data from a regular Glide Table to a Big Table. Glide conveniently provides import/export options directly from the Data Editor to help with this. However the file size of my exported data came out at a bit over 25MB, and the maximum size file you can import into a Big Table is 4.8MB. So my CSV file needed to be split up into smaller bite sized chunks that Glide could handle.

As this is something I can see I’ll probably need to do again in the future, I decided to write some code to take care of the job, and to share it in case others might find it useful.

To use it:

  • Export the data from your regular Glide Table
  • Import the CSV file into a Google Sheet
  • Add the code below via Extensions → App Script
  • Adjust the following constants at the start of the code:
const MAX_FILE_SIZE = 4.5; // Megabytes.
const SHEET_NAME = 'Sheet1'; // The name of the sheet that holds your imported data
  • Save, and execute the splitSheet() function

When the code runs, it will do the following:

  • Firstly it will create a new sheet named “Links”. It will add a series of download links to this sheet, one for each new smaller sheet created.
  • Then it will read all your data, and split it up into multiple sheets, where each sheet when exported should have a file size approximately the same as specified in MAX_FILE_SIZE
  • Once it’s finished, you can click on each link to download the CSV files, and import them into Glide.

Note that expected file size is guestimated based on total character count, so it won’t be exact.

Here is the code:
// Adjust the below to suit
const MAX_FILE_SIZE = 4.5; // Megabytes. Expected download file size is estimated based on character count.
const SHEET_NAME = 'Sheet1'; // The name of the sheet that holds your imported data

// No need to edit anything below here.
const SS = SpreadsheetApp.getActiveSpreadsheet();
const SSID = SS.getId();
const LINKS_SHEET_NAME = 'Links';
const LINKS_SHEET = createOrClearLinksSheet();
  
function splitSheet() {
  const sheet = SS.getSheetByName(SHEET_NAME);
  const data = sheet.getDataRange().getValues();
  const headerRow = data.shift();
  const numCols = headerRow.length;
  let batchNumber = 0;
  const maxCharCount = MAX_FILE_SIZE * 1000000;
  while (data.length > 0) {
    let thisBatchCharCount = 0;
    let thisBatch = [];
    thisBatch.push(headerRow);
    while (thisBatchCharCount < maxCharCount && data.length > 0) {
      let nextRow = data.shift();
      if (typeof nextRow !== 'undefined') { // Skip any empty rows.
        let thisRowCharCount = nextRow.reduce((accumulator, currentValue) => accumulator + currentValue.length, 0);
        thisBatchCharCount += thisRowCharCount;
        thisBatch.push(nextRow);
      }
    }
    batchNumber++;
    writeBatch(batchNumber, numCols, thisBatch);
    thisBatchCharCount = 0;
  }
}

function writeBatch(batchNumber, numCols, data) {
  let batchSheetName = `${SHEET_NAME} - ${batchNumber}`;
  let batchSheet;
  try {
    batchSheet = SS.insertSheet(batchSheetName);
  }
  catch (e) {
    // Already exists, delete it and create a new one
    console.log("%s already exists, deleting", batchSheetName);
    batchSheet = SS.getSheetByName(batchSheetName);
    SS.deleteSheet(batchSheet);
    batchSheet = SS.insertSheet(batchSheetName);
  }
  batchSheet.getRange(1, 1, data.length, numCols).setValues(data);
  let gid = batchSheet.getSheetId();
  let url = `https://docs.google.com/spreadsheets/d/${SSID}/export?format=csv&gid=${gid}`;
  writeDownloadLink(url, batchSheetName);
  console.log("%s done, added %s rows", batchSheetName, data.length);
}

function writeDownloadLink(url, sheetName) {
  const nextRow = LINKS_SHEET.getLastRow() + 1;
  LINKS_SHEET.getRange(nextRow, 1, 1, 2).setValues([[sheetName, url]]);
}

function createOrClearLinksSheet() {
  let sheet;
  try {
    sheet = SS.insertSheet(LINKS_SHEET_NAME);
    sheet.getRange(1, 1, 1, 2).setValues([['Sheet Name', 'Download Link']]);
    return sheet;
  }
  catch (e) {
    // Already exists, clear it
    sheet = SS.getSheetByName(LINKS_SHEET_NAME);
    let lastRow = sheet.getLastRow();
    if (lastRow > 1) {
      sheet.getRange(2, 1, lastRow-1, sheet.getLastColumn()).clearContent();
    }
    return sheet;
  }
}

Enjoy :slight_smile:

7 Likes

Very handy, thank you!