Question:
I have about 1.5 million documents in an AWS CloudSearch index. It is costing me too much and I wish to migrate off the service. I have been unable to see how I can download or export my documents from the index. Is it possible?
Answer:
For a similar need, I had to browse my entire CloudSearch domain (beyond the 10,000-result limit of a plain search) to generate a file.
I used a Node.js script to handle that, like this:
/**
 * Export every document from an AWS CloudSearch domain to a JSON file.
 *
 * Usage: node script.js fileToCreate.json
 *
 * Strategy: CloudSearch caps a plain paginated search at 10,000 results,
 * so we use cursor-based deep paging instead — issue a query that matches
 * everything (a negated impossible term), start with cursor "initial",
 * and keep following the returned cursor until a page comes back empty.
 */
var AWS = require('aws-sdk');
var fs = require('fs');

// Fill in your credentials, region and the search endpoint of your domain.
AWS.config.update({
  accessKeyId: '<yourAccessKey>',
  secretAccessKey: '<yourSecretAccessKey>',
  region: '<yourRegion>',
  endpoint: '<yourSearchDomainEndpoint>'
});

var batchSize = 5000; // Number of items per search page... Max: 10000
var compteur = 0;     // running count of documents retrieved
var result = [];      // accumulates every hit across all pages
var params = { query: "" };
var cloudsearchdomain = new AWS.CloudSearchDomain(params);

/**
 * Fetch one page of results and recurse until an empty page is returned.
 *
 * @param {?Object} theContext - null for the initial request; afterwards an
 *   object carrying {cursor, start, found} from the previous response so we
 *   can continue the scan and print progress.
 */
function launchSearch(theContext) {
  process.stdout.write('Launch AWS.CloudSearch ');
  if (theContext == null) {
    process.stdout.write('initial request ... ');
  } else {
    // Progress display: current page number / estimated total pages.
    var current = (theContext.start / batchSize) + 2;
    var totalRun = (Math.ceil(theContext.found / batchSize * 10) / 10) + 1;
    process.stdout.write('( ' + current + ' / ' + totalRun + ' ) ... ');
  }
  params = {
    // Negation of a term that never occurs => matches every document.
    query: "-aQueryStringImpossibleToFind",
    cursor: (theContext == null) ? "initial" : theContext.cursor,
    size: batchSize
  };

  var forCursor = new AWS.CloudSearchDomain(params);

  forCursor.search(params, function (err, data) {
    if (err) {
      console.log("Failed with params :");
      console.log(err);
      return; // stop the scan; without data there is no cursor to follow
    }

    var resultMessage = data; // was an implicit global in the original
    compteur = compteur + data.hits.hit.length;
    for (var i = 0; i < data.hits.hit.length; i++) {
      result.push(data.hits.hit[i]);
    }

    process.stdout.write(resultMessage.hits.hit.length + ' hits found.');

    if (resultMessage.hits.hit.length == 0) {
      // Empty page: the cursor is exhausted, dump everything to disk.
      process.stdout.write(' Done.\n\nLet\'s create the file...\n');
      writeTheFile(result);
    } else {
      process.stdout.write('\n');
      var myContext = {};
      myContext.cursor = resultMessage.hits.cursor;
      myContext.start = resultMessage.hits.start;
      myContext.found = resultMessage.hits.found;
      myContext.retrieved = resultMessage.hits.hit.length;
      launchSearch(myContext);
    }
  });
}

/**
 * Serialize the accumulated hits to the file named on the command line.
 *
 * @param {Array<Object>} myResult - every hit collected by launchSearch.
 */
function writeTheFile(myResult) {
  fs.writeFile(process.argv[2], JSON.stringify(myResult), function (err) {
    if (err) {
      return console.log(err);
    }
    // Report success only once the asynchronous write has completed
    // (the original printed DONE before the file was actually written).
    process.stdout.write("DONE : File '" + process.argv[2] +
      "' generated ( " + compteur + " elements ).\n");
  });
}

/* Check parameters */
if (!process.argv[2]) {
  process.stdout.write('ERROR : the output filename is expected as argument.\n');
  process.exit();
} else {
  launchSearch();
}
This script has to be called from the command line: node script.js fileToCreate.json
Note: I don't know whether this works correctly on a search domain with 1.5 million documents. The risk I foresee is the size of the in-memory JSON variable, so the script may need to be adapted (perhaps flushing to the file every 100,000 documents?).